diff --git a/README.md b/README.md index 9700f07b2..36c8fdeda 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ FP64算力 算力 算子或原语,
厂商专用工具 - 算子或原语 + N/A N/A @@ -144,7 +144,7 @@ 互联 算子或原语,
厂商专用工具 N/A - N/A + 厂商专用工具 10 @@ -152,7 +152,7 @@ 互联 算子或原语,
厂商专用工具 N/A - N/A + 厂商专用工具 11 @@ -212,6 +212,12 @@ nativetorch
flaggems A100_40_SXM + + ... + ... + ... + ... + @@ -244,6 +250,7 @@ 天数智芯 腾讯九霄 沐曦 + 海飞科 1 @@ -254,6 +261,7 @@ f16 f16 f32/f16 + N/A 2 @@ -264,6 +272,7 @@ f16 N/A f32/f16 + N/A 3 @@ -274,6 +283,7 @@ N/A N/A f32/f16 + N/A 4 @@ -284,6 +294,7 @@ f16 N/A f32/f16 + N/A 5 @@ -294,6 +305,7 @@ N/A N/A f32/f16 + N/A 6 @@ -304,6 +316,7 @@ N/A N/A f32/f16 + N/A 7 @@ -314,6 +327,7 @@ N/A N/A f32/f16 + f32/f16 8 @@ -324,6 +338,7 @@ N/A N/A f16 + N/A 9 @@ -334,6 +349,7 @@ N/A N/A f32/f16 + N/A 10 @@ -344,7 +360,8 @@ N/A N/A N/A - < + N/A + 11 LLaMA3-8B MMLU LLM @@ -353,7 +370,9 @@ N/A N/A N/A - + N/A + + ## 如何使用FlagPerf进行AI硬件评测 diff --git a/assets/imgs/overview.png b/assets/imgs/overview.png index 0728aece5..08d5d8a6b 100644 Binary files a/assets/imgs/overview.png and b/assets/imgs/overview.png differ diff --git a/base/benchmarks/computation-BF16/cambricon/MLU/README.md b/base/benchmarks/computation-BF16/cambricon/MLU/README.md new file mode 100644 index 000000000..bf5815c73 --- /dev/null +++ b/base/benchmarks/computation-BF16/cambricon/MLU/README.md @@ -0,0 +1,44 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon + +## 服务器1 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | BF16算力测试值(8卡平均) | BF16算力标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | ---------------- | ---------------- | ------------- | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | +| ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------------- | -------------- | ------------- | --------------- | +| 监控结果 | / | / | / |/ | diff --git 
a/base/benchmarks/computation-BF16/cambricon/MLU/case_config.yaml b/base/benchmarks/computation-BF16/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..c897da6b7 --- /dev/null +++ b/base/benchmarks/computation-BF16/cambricon/MLU/case_config.yaml @@ -0,0 +1,6 @@ +M: 10240 +N: 10240 +K: 10240 +WARMUP: 100 +ITERS: 50000 +DIST_BACKEND: "cncl" diff --git a/base/benchmarks/computation-BF16/cambricon/MLU/env.sh b/base/benchmarks/computation-BF16/cambricon/MLU/env.sh new file mode 100644 index 000000000..ffc396b9c --- /dev/null +++ b/base/benchmarks/computation-BF16/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/computation-BF16/metax/requirements.txt b/base/benchmarks/computation-BF16/cambricon/MLU/requirements.txt old mode 100755 new mode 100644 similarity index 100% rename from base/benchmarks/computation-BF16/metax/requirements.txt rename to base/benchmarks/computation-BF16/cambricon/MLU/requirements.txt diff --git a/base/benchmarks/computation-BF16/main.py b/base/benchmarks/computation-BF16/main.py index 1bcf255f8..57786aa8e 100644 --- a/base/benchmarks/computation-BF16/main.py +++ b/base/benchmarks/computation-BF16/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/computation-BF16/metax/README.md b/base/benchmarks/computation-BF16/metax/C550/README.md similarity index 74% rename from base/benchmarks/computation-BF16/metax/README.md rename to base/benchmarks/computation-BF16/metax/C550/README.md index 78c1134fa..94e823681 100755 --- a/base/benchmarks/computation-BF16/metax/README.md +++ b/base/benchmarks/computation-BF16/metax/C550/README.md @@ -4,9 +4,9 @@ ## 服务器1 -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W +- 
产品名称:C550 +- 产品型号:曦云®C550 64G +- TDP:450W # 所用服务器配置 @@ -15,10 +15,10 @@ ## 服务器1 * 单服务器内使用卡数:8 -* 服务器型号:同泰怡 G658V3 +* 服务器型号:OAM C550-1500 * 操作系统版本:Ubuntu 20.04.6 LTS * 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core +* CPU:Inter(R) Xeon(R) Plattinum 8480+ * docker版本:24.0.7 * 内存:2TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 @@ -29,16 +29,16 @@ | 评测项 | BF16算力测试值(8卡平均) | BF16算力标定值(8卡平均) | 测试标定比例(8卡平均) | | ---- | ---------------- | ---------------- | ------------- | -| 评测结果 | | | 94.2% | +| 评测结果 | | | 82.8% | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | | ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | -| 监控结果 | 1866.92W | 1998.0W | 88.62W | / | 62.5W | 68.0W | 5.5W | 350W | +| 监控结果 | 4207.5 | 4233.0 | 25.5 | / | 125.5W | 150.0W | 24.5W | 450W | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | | ---- | --------------- | -------------- | ------------- | --------------- | -| 监控结果 | 3.097% | 1.313% | 36.0°C | 3.752% | +| 监控结果 | 0.784% | 0.55% | 35.5°C | 5.003% | diff --git a/base/benchmarks/computation-BF16/metax/C550/case_config.yaml b/base/benchmarks/computation-BF16/metax/C550/case_config.yaml new file mode 100755 index 000000000..8ec0f2721 --- /dev/null +++ b/base/benchmarks/computation-BF16/metax/C550/case_config.yaml @@ -0,0 +1,5 @@ +M: 6656 +N: 2048 +K: 4096 +ITERS: 500 +DIST_BACKEND: "nccl" diff --git a/base/benchmarks/computation-BF16/metax/env.sh b/base/benchmarks/computation-BF16/metax/C550/env.sh similarity index 100% rename from base/benchmarks/computation-BF16/metax/env.sh rename to base/benchmarks/computation-BF16/metax/C550/env.sh diff --git a/base/benchmarks/computation-FP64/metax/requirements.txt b/base/benchmarks/computation-BF16/metax/C550/requirements.txt similarity index 100% rename from base/benchmarks/computation-FP64/metax/requirements.txt rename to 
base/benchmarks/computation-BF16/metax/C550/requirements.txt diff --git a/base/benchmarks/computation-FP16/cambricon/MLU/README.md b/base/benchmarks/computation-FP16/cambricon/MLU/README.md new file mode 100644 index 000000000..c80128fa7 --- /dev/null +++ b/base/benchmarks/computation-FP16/cambricon/MLU/README.md @@ -0,0 +1,44 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon + +## 服务器1 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | FP16算力测试值(8卡平均) | FP16算力标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | ---------------- | ---------------- | ------------- | +| 评测结果 | / | / | / | | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | +| ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------------- | -------------- | ------------- | --------------- | +| 监控结果 | / | / | / |/ | diff --git a/base/benchmarks/computation-FP16/cambricon/MLU/case_config.yaml b/base/benchmarks/computation-FP16/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..8407e7b18 --- /dev/null +++ b/base/benchmarks/computation-FP16/cambricon/MLU/case_config.yaml @@ -0,0 +1,4 @@ +M: 10240 +N: 10240 +K: 10240 +DIST_BACKEND: "cncl" diff --git a/base/benchmarks/computation-FP16/cambricon/MLU/env.sh b/base/benchmarks/computation-FP16/cambricon/MLU/env.sh new file mode 100644 index 000000000..ffc396b9c --- /dev/null +++ b/base/benchmarks/computation-FP16/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/computation-TF32/metax/requirements.txt 
b/base/benchmarks/computation-FP16/cambricon/MLU/requirements.txt old mode 100755 new mode 100644 similarity index 100% rename from base/benchmarks/computation-TF32/metax/requirements.txt rename to base/benchmarks/computation-FP16/cambricon/MLU/requirements.txt diff --git a/base/benchmarks/computation-FP16/main.py b/base/benchmarks/computation-FP16/main.py index 5ceefca7e..353ae20a6 100644 --- a/base/benchmarks/computation-FP16/main.py +++ b/base/benchmarks/computation-FP16/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/computation-TF32/metax/README.md b/base/benchmarks/computation-FP16/metax/C550/README.md similarity index 70% rename from base/benchmarks/computation-TF32/metax/README.md rename to base/benchmarks/computation-FP16/metax/C550/README.md index c1478da28..2a2c4b78b 100755 --- a/base/benchmarks/computation-TF32/metax/README.md +++ b/base/benchmarks/computation-FP16/metax/C550/README.md @@ -4,9 +4,9 @@ ## 服务器1 -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W +- 产品名称:C550 +- 产品型号:曦云®C550 64G +- TDP:450W # 所用服务器配置 @@ -15,10 +15,10 @@ ## 服务器1 * 单服务器内使用卡数:8 -* 服务器型号:同泰怡 G658V3 +* 服务器型号:OAM C550-1500 * 操作系统版本:Ubuntu 20.04.6 LTS * 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core +* CPU:Inter(R) Xeon(R) Plattinum 8480+ * docker版本:24.0.7 * 内存:2TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 @@ -27,18 +27,18 @@ ## 核心评测结果 -| 评测项 | BF16算力测试值(8卡平均) | BF16算力标定值(8卡平均) | 测试标定比例(8卡平均) | +| 评测项 | FP16算力测试值(8卡平均) | FP16算力标定值(8卡平均) | 测试标定比例(8卡平均) | | ---- | ---------------- | ---------------- | ------------- | -| 评测结果 | | | 88.55% | +| 评测结果 | | | 83.5% | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | | ---- | ------------ 
| ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | -| 监控结果 | 2012.0W | 3486.0W | 450.54W | / | 69.0W | 81.0W | 12.0W | 350W | +| 监控结果 | 4182.0W | 4182.0W | 0.0W | / | 112.5W | 124.0W | 11.5W | 450W | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | | ---- | --------------- | -------------- | ------------- | --------------- | -| 监控结果 | 3.423% | 1.34% | 36.5°C | 5.148% | +| 监控结果 | 0.872% | 0.55% | 34.5°C | 4.71% | diff --git a/base/benchmarks/computation-FP16/metax/C550/case_config.yaml b/base/benchmarks/computation-FP16/metax/C550/case_config.yaml new file mode 100755 index 000000000..cf5a4cdb2 --- /dev/null +++ b/base/benchmarks/computation-FP16/metax/C550/case_config.yaml @@ -0,0 +1,5 @@ +M: 6656 +N: 2048 +K: 4096 +ITERS: 500 +DIST_BACKEND: "nccl" diff --git a/base/benchmarks/computation-FP16/metax/env.sh b/base/benchmarks/computation-FP16/metax/C550/env.sh similarity index 100% rename from base/benchmarks/computation-FP16/metax/env.sh rename to base/benchmarks/computation-FP16/metax/C550/env.sh diff --git a/base/benchmarks/main_memory-bandwidth/metax/requirements.txt b/base/benchmarks/computation-FP16/metax/C550/requirements.txt similarity index 100% rename from base/benchmarks/main_memory-bandwidth/metax/requirements.txt rename to base/benchmarks/computation-FP16/metax/C550/requirements.txt diff --git a/base/benchmarks/computation-FP32/cambricon/MLU/README.md b/base/benchmarks/computation-FP32/cambricon/MLU/README.md new file mode 100644 index 000000000..57789fe15 --- /dev/null +++ b/base/benchmarks/computation-FP32/cambricon/MLU/README.md @@ -0,0 +1,44 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon + +## 服务器1 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | FP32算力测试值(8卡平均) | FP32算力标定值(8卡平均) | 
测试标定比例(8卡平均) | +| ---- | ---------------- | ---------------- | ------------- | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | +| ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------------- | -------------- | ------------- | --------------- | +| 监控结果 | / | / | / |/ | diff --git a/base/benchmarks/computation-FP32/cambricon/MLU/case_config.yaml b/base/benchmarks/computation-FP32/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..3d8cd0adb --- /dev/null +++ b/base/benchmarks/computation-FP32/cambricon/MLU/case_config.yaml @@ -0,0 +1,4 @@ +M: 12888 +N: 12888 +K: 12888 +DIST_BACKEND: "cncl" diff --git a/base/benchmarks/computation-FP32/cambricon/MLU/env.sh b/base/benchmarks/computation-FP32/cambricon/MLU/env.sh new file mode 100644 index 000000000..ffc396b9c --- /dev/null +++ b/base/benchmarks/computation-FP32/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/main_memory-capacity/metax/requirements.txt b/base/benchmarks/computation-FP32/cambricon/MLU/requirements.txt old mode 100755 new mode 100644 similarity index 100% rename from base/benchmarks/main_memory-capacity/metax/requirements.txt rename to base/benchmarks/computation-FP32/cambricon/MLU/requirements.txt diff --git a/base/benchmarks/computation-FP32/main.py b/base/benchmarks/computation-FP32/main.py index cd2161fd5..20afe0f6f 100644 --- a/base/benchmarks/computation-FP32/main.py +++ b/base/benchmarks/computation-FP32/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except 
ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/computation-FP32/metax/README.md b/base/benchmarks/computation-FP32/metax/C550/README.md similarity index 70% rename from base/benchmarks/computation-FP32/metax/README.md rename to base/benchmarks/computation-FP32/metax/C550/README.md index 3ed5753b9..7b9f63d5e 100755 --- a/base/benchmarks/computation-FP32/metax/README.md +++ b/base/benchmarks/computation-FP32/metax/C550/README.md @@ -4,9 +4,9 @@ ## 服务器1 -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W +- 产品名称:C550 +- 产品型号:曦云®C550 64G +- TDP:450W # 所用服务器配置 @@ -15,10 +15,10 @@ ## 服务器1 * 单服务器内使用卡数:8 -* 服务器型号:同泰怡 G658V3 +* 服务器型号:OAM C550-1500 * 操作系统版本:Ubuntu 20.04.6 LTS * 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core +* CPU:Inter(R) Xeon(R) Plattinum 8480+ * docker版本:24.0.7 * 内存:2TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 @@ -27,18 +27,18 @@ ## 核心评测结果 -| 评测项 | BF16算力测试值(8卡平均) | BF16算力标定值(8卡平均) | 测试标定比例(8卡平均) | +| 评测项 | FP32算力测试值(8卡平均) | FP32算力标定值(8卡平均) | 测试标定比例(8卡平均) | | ---- | ---------------- | ---------------- | ------------- | -| 评测结果 | | | 95.6% | +| 评测结果 | | | 87.4% | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | | ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | -| 监控结果 | 1909.38W | 2094.0W | 87.49W | / | 168.5W | 280.0W | 111.5W | 350W | +| 监控结果 | 4896.0W | 6069.0W | 1173W | / | 253.0W | 405.0W | 152.0W | 450W | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | | ---- | --------------- | -------------- | ------------- | --------------- | -| 监控结果 | 4.108% | 1.336% | 37.25°C | 5.18% | +| 监控结果 | 0.793% | 1.096% | 60.0°C | 6.371% | diff --git a/base/benchmarks/computation-FP16/metax/case_config.yaml b/base/benchmarks/computation-FP32/metax/C550/case_config.yaml similarity index 100% rename from 
base/benchmarks/computation-FP16/metax/case_config.yaml rename to base/benchmarks/computation-FP32/metax/C550/case_config.yaml diff --git a/base/benchmarks/computation-FP32/metax/env.sh b/base/benchmarks/computation-FP32/metax/C550/env.sh similarity index 100% rename from base/benchmarks/computation-FP32/metax/env.sh rename to base/benchmarks/computation-FP32/metax/C550/env.sh diff --git a/base/benchmarks/computation-FP32/metax/C550/requirements.txt b/base/benchmarks/computation-FP32/metax/C550/requirements.txt new file mode 100755 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/computation-FP32/metax/C550/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/computation-FP64/metax/README.md b/base/benchmarks/computation-FP64/metax/README.md deleted file mode 100755 index 7bd4502bb..000000000 --- a/base/benchmarks/computation-FP64/metax/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Metax - -## 服务器1 - -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W - -# 所用服务器配置 - -* 服务器数量:1 - -## 服务器1 - -* 单服务器内使用卡数:8 -* 服务器型号:同泰怡 G658V3 -* 操作系统版本:Ubuntu 20.04.6 LTS -* 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core -* docker版本:24.0.7 -* 内存:2TiB -* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | BF16算力测试值(8卡平均) | BF16算力标定值(8卡平均) | 测试标定比例(8卡平均) | -| ---- | ---------------- | ---------------- | ------------- | -| 评测结果 | | | 88.7% | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | -| ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | -| 监控结果 | 2311.85W | 3462.0W | 640.64W | / | 149.0W | 241.0W | 92.0W | 350W | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | -| ---- | --------------- | -------------- | ------------- | --------------- | -| 监控结果 | 4.751% | 1.58% | 40.75°C | 6.210% | diff --git 
a/base/benchmarks/computation-FP64/metax/case_config.yaml b/base/benchmarks/computation-FP64/metax/case_config.yaml deleted file mode 100755 index fab052b4e..000000000 --- a/base/benchmarks/computation-FP64/metax/case_config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -M: 8192 -N: 8192 -K: 8192 -DIST_BACKEND: "nccl" diff --git a/base/benchmarks/computation-FP16/metax/README.md b/base/benchmarks/computation-TF32/cambricon/MLU/README.md old mode 100755 new mode 100644 similarity index 58% rename from base/benchmarks/computation-FP16/metax/README.md rename to base/benchmarks/computation-TF32/cambricon/MLU/README.md index 8a0e20bb7..fa96f1f15 --- a/base/benchmarks/computation-FP16/metax/README.md +++ b/base/benchmarks/computation-TF32/cambricon/MLU/README.md @@ -1,12 +1,12 @@ # 参评AI芯片信息 -* 厂商:Metax +* 厂商:Cambricon ## 服务器1 -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W +- 产品名称:/ +- 产品型号:/ +- TDP:/ # 所用服务器配置 @@ -15,11 +15,11 @@ ## 服务器1 * 单服务器内使用卡数:8 -* 服务器型号:同泰怡 G658V3 -* 操作系统版本:Ubuntu 20.04.6 LTS -* 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core -* docker版本:24.0.7 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 * 内存:2TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 @@ -27,18 +27,20 @@ ## 核心评测结果 -| 评测项 | BF16算力测试值(8卡平均) | BF16算力标定值(8卡平均) | 测试标定比例(8卡平均) | -| ---- | ---------------- | ---------------- | ------------- | -| 评测结果 | | | 90.9% | +| 评测项 | TF32算力测试值(8卡平均) | TF32算力标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | --------------- | --------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | | ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | -| 监控结果 | 1942.62W | 2574.0W | 203.73W | / | 101.0W | 145.0W | 4.0W | 350W | +| 监控结果 | / | / | / | / | / | / | / | / | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | | ---- | 
--------------- | -------------- | ------------- | --------------- | -| 监控结果 | 4.371% | 1.39% | 37.5.0°C | 3.922% | +| 监控结果 | / | / | / |/ | diff --git a/base/benchmarks/computation-TF32/cambricon/MLU/case_config.yaml b/base/benchmarks/computation-TF32/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..fa08932d9 --- /dev/null +++ b/base/benchmarks/computation-TF32/cambricon/MLU/case_config.yaml @@ -0,0 +1,5 @@ +M: 8192 +N: 8192 +K: 8192 +ITERS: 40000 +DIST_BACKEND: "cncl" diff --git a/base/benchmarks/computation-TF32/cambricon/MLU/env.sh b/base/benchmarks/computation-TF32/cambricon/MLU/env.sh new file mode 100644 index 000000000..ffc396b9c --- /dev/null +++ b/base/benchmarks/computation-TF32/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/computation-TF32/cambricon/MLU/requirements.txt b/base/benchmarks/computation-TF32/cambricon/MLU/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/computation-TF32/cambricon/MLU/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/computation-TF32/main.py b/base/benchmarks/computation-TF32/main.py index c94cd588c..690410696 100644 --- a/base/benchmarks/computation-TF32/main.py +++ b/base/benchmarks/computation-TF32/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/computation-TF32/metax/C550/README.md b/base/benchmarks/computation-TF32/metax/C550/README.md new file mode 100755 index 000000000..2d5c0f1c7 --- /dev/null +++ b/base/benchmarks/computation-TF32/metax/C550/README.md @@ -0,0 +1,40 @@ +## 服务器1 + +- 产品名称:C550 +- 产品型号:曦云®C550 64G +- TDP:450W + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:8 +* 服务器型号:OAM 
C550-1500 +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.15.0-58-generic +* CPU:Inter(R) Xeon(R) Plattinum 8480+ +* docker版本:24.0.7 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | TF32算力测试值(8卡平均) | TF32算力标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | ---------------- | ---------------- | ------------- | +| 评测结果 | | | 82.8% | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | +| ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | +| 监控结果 | 4207.5W | 4233.0W | 25.5W | / | 292.0W | 484.0W | 192.0W | 450W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------------- | -------------- | ------------- | --------------- | +| 监控结果 | 0.998% | 0.718% | 49.5°C | 6.371% | diff --git a/base/benchmarks/computation-FP32/metax/case_config.yaml b/base/benchmarks/computation-TF32/metax/C550/case_config.yaml similarity index 72% rename from base/benchmarks/computation-FP32/metax/case_config.yaml rename to base/benchmarks/computation-TF32/metax/C550/case_config.yaml index 0f479f7a1..a7f91ccbe 100755 --- a/base/benchmarks/computation-FP32/metax/case_config.yaml +++ b/base/benchmarks/computation-TF32/metax/C550/case_config.yaml @@ -1,4 +1,5 @@ M: 8192 N: 8192 K: 8192 +ITERS: 1000 DIST_BACKEND: "nccl" diff --git a/base/benchmarks/computation-TF32/metax/C550/env.sh b/base/benchmarks/computation-TF32/metax/C550/env.sh new file mode 100755 index 000000000..5a1bd0f4f --- /dev/null +++ b/base/benchmarks/computation-TF32/metax/C550/env.sh @@ -0,0 +1,2 @@ +echo "Metax PLACEHOLDER ENV.SH" +export TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1 \ No newline at end of file diff --git a/base/benchmarks/computation-TF32/metax/C550/requirements.txt b/base/benchmarks/computation-TF32/metax/C550/requirements.txt new file mode 100755 index 000000000..330e27963 --- /dev/null +++ 
b/base/benchmarks/computation-TF32/metax/C550/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/computation-TF32/metax/case_config.yaml b/base/benchmarks/computation-TF32/metax/case_config.yaml deleted file mode 100755 index 0f479f7a1..000000000 --- a/base/benchmarks/computation-TF32/metax/case_config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -M: 8192 -N: 8192 -K: 8192 -DIST_BACKEND: "nccl" diff --git a/base/benchmarks/drivers/utils.py b/base/benchmarks/drivers/utils.py index a05954a0b..33c65ecab 100644 --- a/base/benchmarks/drivers/utils.py +++ b/base/benchmarks/drivers/utils.py @@ -9,6 +9,9 @@ def set_ieee_float32(vendor): if vendor == "nvidia": torch.backends.cuda.matmul.allow_tf32 = False + elif "cambricon" in vendor: + torch.backends.mlu.matmul.allow_tf32 = False + torch.backends.cnnl.allow_tf32 = False else: print("unspecified vendor {}, do nothing".format(vendor)) @@ -16,6 +19,9 @@ def set_ieee_float32(vendor): def unset_ieee_float32(vendor): if vendor == "nvidia": torch.backends.cuda.matmul.allow_tf32 = True + elif "cambricon" in vendor: + torch.backends.mlu.matmul.allow_tf32 = True + torch.backends.cnnl.allow_tf32 = True else: print("unspecified vendor {}, do nothing".format(vendor)) diff --git a/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/README.md b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/README.md new file mode 100644 index 000000000..04b06e2ec --- /dev/null +++ b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/README.md @@ -0,0 +1,52 @@ +# 参评AI芯片信息 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间多卡的MPI互联带宽采用多种通信方式组合,无标定互联带宽 + +# 指标选型 + +The following are the three performance metrics commonly used + 1. samples/s (algbw): This metric measures the number of samples processed per second, indicating the algorithmic bandwidth. 
It reflects the computational efficiency of the algorithm. + 2. busbw: This metric represents the bus bandwidth, which measures the data transfer rate across the system's bus. It is crucial for understanding the communication efficiency between different parts of the system. + 3. busbw * 2: This metric is an extension of busbw, accounting for bidirectional data transfer. It doubles the bus bandwidth to reflect the full duplex capability of the system. +The second metric, busbw, is chosen for the following reasons: + 1. This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. + 2. We can horizontally compare the MPI of different patterns such as all-gather/all-reduce/reduce-scatter. + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器间多卡的MPI互联带宽测试值(16卡平均) | 服务器间多卡的MPI互联带宽标定值(16卡平均) | 测试标定比例(16卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(16卡平均) | 单卡最大功耗(16卡最大) | 单卡功耗标准差(16卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(16卡平均) | 单卡平均显存占用(16卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | / | / | / | / | + +使用torch.all_reduce,进行多机多卡的MPI互联操作,计算服务器内MPI互联带宽 + +* 注:如镜像启动时ssh并未随命令开启,请切换至[容器内启动](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md) diff --git a/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/case_config.yaml b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..37fed4984 --- /dev/null +++ b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/case_config.yaml @@ -0,0 
+1,4 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 10000 +DIST_BACKEND: "cncl" diff --git a/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/env.sh b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/env.sh new file mode 100644 index 000000000..7f15bc38a --- /dev/null +++ b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/env.sh @@ -0,0 +1,4 @@ +echo "Cambricon PLACEHOLDER ENV.SH" +export CNCL_MLULINK_OVER_ROCE_DISABLE=1 +export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0 +export CNCL_MLU_DIRECT_LEVEL=1 diff --git a/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/requirements.txt b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/requirements.txt new file mode 100644 index 000000000..7248303e5 --- /dev/null +++ b/base/benchmarks/interconnect-MPI_interserver/cambricon/MLU/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_interserver/main.py b/base/benchmarks/interconnect-MPI_interserver/main.py index fafa3cb74..163d9eec1 100644 --- a/base/benchmarks/interconnect-MPI_interserver/main.py +++ b/base/benchmarks/interconnect-MPI_interserver/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os @@ -113,4 +120,4 @@ def main(config, case_config, rank, world_size, local_rank): print(r"[FlagPerf Result]Rank {}'s transfer-bandwidth=".format(dist.get_rank()) + str(gib) + "GiB/s") multi_device_sync(config.vendor) - dist.destroy_process_group() \ No newline at end of file + dist.destroy_process_group() diff --git a/base/benchmarks/interconnect-MPI_interserver/metax/C550/README.md b/base/benchmarks/interconnect-MPI_interserver/metax/C550/README.md new file mode 100755 index 000000000..c1060cd3d --- /dev/null +++ 
b/base/benchmarks/interconnect-MPI_interserver/metax/C550/README.md @@ -0,0 +1,57 @@ +# 参评AI芯片信息 + +* 厂商:Metax + + +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W + +# 所用服务器配置 + +* 服务器数量:2 + + +* 单服务器内使用卡数:8 +* 服务器型号:OAM C550-1500 +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.15.0-58-generic +* CPU:Inter(R) Xeon(R) Plattinum 8480+ +* docker版本:24.0.7 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 指标选型 + +The following are the three performance metrics commonly used + 1. samples/s (algbw): This metric measures the number of samples processed per second, indicating the algorithmic bandwidth. It reflects the computational efficiency of the algorithm. + 2. busbw: This metric represents the bus bandwidth, which measures the data transfer rate across the system's bus. It is crucial for understanding the communication efficiency between different parts of the system. + 3. busbw * 2: This metric is an extension of busbw, accounting for bidirectional data transfer. It doubles the bus bandwidth to reflect the full duplex capability of the system. +The second metric, busbw, is chosen for the following reasons: + 1. This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. + 2. We can horizontally compare the MPI of different patterns such as all-gather/all-reduce/reduce-scatter. 
+ +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器间多卡的MPI互联带宽测试值(16卡平均) | 服务器间多卡的MPI互联带宽标定值(16卡平均) | 测试标定比例(16卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(16卡平均) | 单卡最大功耗(16卡最大) | 单卡功耗标准差(16卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | 4335.0W | 4488.0W | 153.0W | / | 136.5W | 173.0W | 36.5W | 450W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(16卡平均) | 单卡平均显存占用(16卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | 1.059% |0.957% | 36.5°C | 11.253% | + +使用torch.all_reduce,进行多机多卡的MPI互联操作,计算服务器内MPI互联带宽 + +* 注:如镜像启动时ssh并未随命令开启,请切换至[容器内启动](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md) \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_interserver/metax/C550/case_config.yaml b/base/benchmarks/interconnect-MPI_interserver/metax/C550/case_config.yaml new file mode 100755 index 000000000..a4f8687ff --- /dev/null +++ b/base/benchmarks/interconnect-MPI_interserver/metax/C550/case_config.yaml @@ -0,0 +1 @@ +DIST_BACKEND: "nccl" \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_interserver/metax/C550/env.sh b/base/benchmarks/interconnect-MPI_interserver/metax/C550/env.sh new file mode 100755 index 000000000..2523270bc --- /dev/null +++ b/base/benchmarks/interconnect-MPI_interserver/metax/C550/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_interserver/metax/C550/requirements.txt b/base/benchmarks/interconnect-MPI_interserver/metax/C550/requirements.txt new file mode 100755 index 000000000..7248303e5 --- /dev/null +++ b/base/benchmarks/interconnect-MPI_interserver/metax/C550/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git 
a/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/README.md b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/README.md new file mode 100644 index 000000000..d01c5af5b --- /dev/null +++ b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/README.md @@ -0,0 +1,49 @@ +# 参评AI芯片信息 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间多卡的MPI互联带宽采用多种通信方式组合,无标定互联带宽 + +# 指标选型 + +The following are the three performance metrics commonly used + 1. samples/s (algbw): This metric measures the number of samples processed per second, indicating the algorithmic bandwidth. It reflects the computational efficiency of the algorithm. + 2. busbw: This metric represents the bus bandwidth, which measures the data transfer rate across the system's bus. It is crucial for understanding the communication efficiency between different parts of the system. + 3. busbw * 2: This metric is an extension of busbw, accounting for bidirectional data transfer. It doubles the bus bandwidth to reflect the full duplex capability of the system. +The second metric, busbw, is chosen for the following reasons: + 1. This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. + 2. We can horizontally compare the MPI of different patterns such as all-gather/all-reduce/reduce-scatter. 
+ +# 评测结果 + +## 核心评测结果 + +| 评测项 | 单机多卡的MPI互联带宽测试值(8卡平均) | 单机多卡的MPI互联带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | / | / | / | / | + +使用torch.all_reduce,进行单机多卡的MPI互联操作,计算服务器内MPI互联带宽 \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/case_config.yaml b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..34befe4dc --- /dev/null +++ b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/case_config.yaml @@ -0,0 +1,4 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 5000 +DIST_BACKEND: "cncl" \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/env.sh b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/env.sh new file mode 100644 index 000000000..91d59b490 --- /dev/null +++ b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/requirements.txt b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/requirements.txt new file mode 100644 index 000000000..7248303e5 --- /dev/null +++ b/base/benchmarks/interconnect-MPI_intraserver/cambricon/MLU/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_intraserver/main.py b/base/benchmarks/interconnect-MPI_intraserver/main.py index 4576e63b9..80d140249 100644 ---
a/base/benchmarks/interconnect-MPI_intraserver/main.py +++ b/base/benchmarks/interconnect-MPI_intraserver/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/interconnect-MPI_intraserver/metax/C550/README.md b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/README.md new file mode 100755 index 000000000..80e09da6b --- /dev/null +++ b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/README.md @@ -0,0 +1,45 @@ +# 参评AI芯片信息 + +* 厂商:Metax + + +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:8 +* 服务器型号:OAM C550-1500 +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.15.0-58-generic +* CPU:Inter(R) Xeon(R) Plattinum 8480+ +* docker版本:24.0.7 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 单机多卡的MPI互联带宽测试值(8卡平均) | 单机多卡的MPI互联带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | 4437.0W | 4692.0W | 255.0W | / | 160.0W | 219.0W | 59.0W | 450W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | 1.022% | 1.008% | 37.5°C | 10.765% | + +使用torch.all_reduce,进行单机多卡的MPI互联操作,计算服务器内MPI互联带宽 diff --git a/base/benchmarks/interconnect-MPI_intraserver/metax/C550/case_config.yaml b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/case_config.yaml new file mode 100755 index 000000000..a4f8687ff --- /dev/null +++ 
b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/case_config.yaml @@ -0,0 +1 @@ +DIST_BACKEND: "nccl" \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_intraserver/metax/C550/env.sh b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/env.sh new file mode 100755 index 000000000..2523270bc --- /dev/null +++ b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" \ No newline at end of file diff --git a/base/benchmarks/interconnect-MPI_intraserver/metax/C550/requirements.txt b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/requirements.txt new file mode 100755 index 000000000..7248303e5 --- /dev/null +++ b/base/benchmarks/interconnect-MPI_intraserver/metax/C550/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git a/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/README.md b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/README.md new file mode 100644 index 000000000..166a1af30 --- /dev/null +++ b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/README.md @@ -0,0 +1,36 @@ +# 参评AI芯片信息 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 跨服务器P2P互联带宽测试值(2卡平均) | 跨服务器P2P互联带宽标定值(2卡平均) | 测试标定比例(2卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | % | % | / | / | \ No newline at end of file diff --git 
a/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/case_config.yaml b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6c1cd5a60 --- /dev/null +++ b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/case_config.yaml @@ -0,0 +1,4 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 2000 +DIST_BACKEND: "cncl" \ No newline at end of file diff --git a/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/env.sh b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/env.sh new file mode 100644 index 000000000..7f15bc38a --- /dev/null +++ b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/env.sh @@ -0,0 +1,4 @@ +echo "Cambricon PLACEHOLDER ENV.SH" +export CNCL_MLULINK_OVER_ROCE_DISABLE=1 +export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0 +export CNCL_MLU_DIRECT_LEVEL=1 diff --git a/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/requirements.txt b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/interconnect-P2P_interserver/cambricon/MLU/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/interconnect-P2P_interserver/main.py b/base/benchmarks/interconnect-P2P_interserver/main.py index b4f03106b..bf0c5c576 100644 --- a/base/benchmarks/interconnect-P2P_interserver/main.py +++ b/base/benchmarks/interconnect-P2P_interserver/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/interconnect-P2P_interserver/metax/C550/README.md b/base/benchmarks/interconnect-P2P_interserver/metax/C550/README.md new file mode 100755 index 000000000..1e234d8ad --- /dev/null +++ 
b/base/benchmarks/interconnect-P2P_interserver/metax/C550/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Metax + + +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W + +# 所用服务器配置 + +* 服务器数量:2 + + +* 单服务器内使用卡数:1 +* 服务器型号:OAM C550-1500 +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.15.0-58-generic +* CPU:Intel(R) Xeon(R) Platinum 8480+ +* docker版本:24.0.7 +* 内存:2TiB +* IB网卡:400Gb/s + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 跨服务器P2P互联带宽测试值(2卡平均) | 跨服务器P2P互联带宽标定值(2卡平均) | 测试标定比例(2卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | 4207.5W | 4233.0W | 25.5W | / | 114.0W | 128.0W | 14.0W | 450W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | 1.289% | 1.029% | 36.0°C | 54.563% | diff --git a/base/benchmarks/main_memory-bandwidth/metax/case_config.yaml b/base/benchmarks/interconnect-P2P_interserver/metax/C550/case_config.yaml similarity index 100% rename from base/benchmarks/main_memory-bandwidth/metax/case_config.yaml rename to base/benchmarks/interconnect-P2P_interserver/metax/C550/case_config.yaml diff --git a/base/benchmarks/interconnect-P2P_interserver/metax/C550/env.sh b/base/benchmarks/interconnect-P2P_interserver/metax/C550/env.sh new file mode 100755 index 000000000..0cdec082d --- /dev/null +++ b/base/benchmarks/interconnect-P2P_interserver/metax/C550/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/interconnect-P2P_interserver/metax/C550/requirements.txt b/base/benchmarks/interconnect-P2P_interserver/metax/C550/requirements.txt new file mode 100755 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/interconnect-P2P_interserver/metax/C550/requirements.txt @@ -0,0 +1 @@
+loguru diff --git a/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/README.md b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/README.md new file mode 100644 index 000000000..24515a340 --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/README.md @@ -0,0 +1,43 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon + +## 服务器1 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器内P2P互联带宽测试值(2卡平均) | 服务器内P2P互联带宽标定值(2卡平均) | 测试标定比例(2卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | / | / | / | / | diff --git a/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/case_config.yaml b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..490d2365d --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/case_config.yaml @@ -0,0 +1,4 @@ +DIST_BACKEND: "cncl" +Melements: 1024 +WARMUP: 100 +ITERS: 3000 diff --git a/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/env.sh b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/env.sh new file mode 100644 index 000000000..ffc396b9c --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/requirements.txt 
b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/cambricon/MLU/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/interconnect-P2P_intraserver/main.py b/base/benchmarks/interconnect-P2P_intraserver/main.py index cd314dc2e..15314fe78 100644 --- a/base/benchmarks/interconnect-P2P_intraserver/main.py +++ b/base/benchmarks/interconnect-P2P_intraserver/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/interconnect-P2P_intraserver/metax/C550/README.md b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/README.md new file mode 100755 index 000000000..56d0cf3ab --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/README.md @@ -0,0 +1,41 @@ +# 参评AI芯片信息 + +* 厂商:Metax + + +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:2 +* 服务器型号:OAM C550-1500 +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.15.0-58-generic +* CPU:Inter(R) Xeon(R) Plattinum 8480+ +* docker版本:24.0.7 +* 内存:2TiB + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器内P2P互联带宽测试值(2卡平均) | 服务器内P2P互联带宽标定值(2卡平均) | 测试标定比例(2卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | 4490.22W | 4539.0W | 74.41W | / | 149.3W | 153.0W | 12.3W | 450W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------------ | 
-------------- | +| 监控结果 | 0.802% | 1.62% | 36.95°C | 12.328% | diff --git a/base/benchmarks/interconnect-P2P_intraserver/metax/C550/case_config.yaml b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/case_config.yaml new file mode 100755 index 000000000..c787d269c --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 1000 +DIST_BACKEND: "nccl" diff --git a/base/benchmarks/interconnect-P2P_intraserver/metax/C550/env.sh b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/env.sh new file mode 100755 index 000000000..0cdec082d --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/interconnect-P2P_intraserver/metax/C550/requirements.txt b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/requirements.txt new file mode 100755 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/interconnect-P2P_intraserver/metax/C550/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/interconnect-h2d/cambricon/MLU/README.md b/base/benchmarks/interconnect-h2d/cambricon/MLU/README.md new file mode 100644 index 000000000..58725f980 --- /dev/null +++ b/base/benchmarks/interconnect-h2d/cambricon/MLU/README.md @@ -0,0 +1,46 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon + +## 服务器1 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | CPU-芯片互联带宽测试值(8卡平均) | CPU-芯片互联带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +注: h2d/d2h带宽受到CPU、PCIE、内存等服务器内AI芯片以外的模块影响,无标定值 + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | +| ---- | ------------ | ------------ | ------------- | 
----- | ------------- | ------------- | -------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | / | / | / | / | diff --git a/base/benchmarks/interconnect-h2d/cambricon/MLU/case_config.yaml b/base/benchmarks/interconnect-h2d/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..360adeac8 --- /dev/null +++ b/base/benchmarks/interconnect-h2d/cambricon/MLU/case_config.yaml @@ -0,0 +1,4 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 750 +DIST_BACKEND: "cncl" diff --git a/base/benchmarks/interconnect-h2d/cambricon/MLU/env.sh b/base/benchmarks/interconnect-h2d/cambricon/MLU/env.sh new file mode 100644 index 000000000..91d59b490 --- /dev/null +++ b/base/benchmarks/interconnect-h2d/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" \ No newline at end of file diff --git a/base/benchmarks/interconnect-h2d/main.py b/base/benchmarks/interconnect-h2d/main.py index 811526e43..71a064c7e 100644 --- a/base/benchmarks/interconnect-h2d/main.py +++ b/base/benchmarks/interconnect-h2d/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os @@ -96,4 +103,4 @@ def main(config, case_config, rank, world_size, local_rank): print(r"[FlagPerf Result]Rank {}'s transfer-bandwidth=".format(dist.get_rank()) + str(gib) + "GiB/s") multi_device_sync(config.vendor) - dist.destroy_process_group() \ No newline at end of file + dist.destroy_process_group() diff --git a/base/benchmarks/main_memory-bandwidth/cambricon/MLU/README.md b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/README.md new file mode 100644 index 000000000..7199b2fbc --- /dev/null +++ 
b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon + +## 服务器1 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +## 核心评测结果 + +| 评测项 | 主存储带宽测试值(8卡平均) | 主存储带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | / | / | / | / | / | / | /W | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | / | / | / | / | diff --git a/base/benchmarks/main_memory-bandwidth/cambricon/MLU/case_config.yaml b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..f86bca2b0 --- /dev/null +++ b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/case_config.yaml @@ -0,0 +1,4 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 100000 +DIST_BACKEND: "cncl" \ No newline at end of file diff --git a/base/benchmarks/main_memory-bandwidth/cambricon/MLU/env.sh b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/env.sh new file mode 100644 index 000000000..ffc396b9c --- /dev/null +++ b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/main_memory-bandwidth/cambricon/MLU/requirements.txt b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/main_memory-bandwidth/cambricon/MLU/requirements.txt @@ -0,0 +1 @@ +loguru diff --git 
a/base/benchmarks/main_memory-bandwidth/main.py b/base/benchmarks/main_memory-bandwidth/main.py index 8b6830cd7..be62f7311 100644 --- a/base/benchmarks/main_memory-bandwidth/main.py +++ b/base/benchmarks/main_memory-bandwidth/main.py @@ -3,6 +3,13 @@ # Licensed under the Apache License, Version 2.0 (the "License") #!/usr/bin/env python3 # -*- coding: UTF-8 -*- + +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os diff --git a/base/benchmarks/main_memory-bandwidth/metax/README.md b/base/benchmarks/main_memory-bandwidth/metax/C550/README.md similarity index 70% rename from base/benchmarks/main_memory-bandwidth/metax/README.md rename to base/benchmarks/main_memory-bandwidth/metax/C550/README.md index f922eb50e..01a55ae00 100755 --- a/base/benchmarks/main_memory-bandwidth/metax/README.md +++ b/base/benchmarks/main_memory-bandwidth/metax/C550/README.md @@ -2,23 +2,21 @@ * 厂商:Metax -## 服务器1 -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W # 所用服务器配置 * 服务器数量:1 -## 服务器1 -* 单服务器内使用卡数:8 -* 服务器型号:同泰怡 G658V3 +* 单服务器内使用卡数:2 +* 服务器型号:OAM C550-1500 * 操作系统版本:Ubuntu 20.04.6 LTS * 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core +* CPU:Inter(R) Xeon(R) Plattinum 8480+ * docker版本:24.0.7 * 内存:2TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 @@ -29,16 +27,16 @@ | 评测项 | 主存储带宽测试值(8卡平均) | 主存储带宽标定值(8卡平均) | 测试标定比例(8卡平均) | | ---- | -------------- | -------------- | ------------ | -| 评测结果 | 1376.045GB/s | 1.8TB/s | 74.6% | +| 评测结果 | 1390.5GB/s | 1.8TB/s | 75.44% | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | | ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| 监控结果 | 1946.31W | 2334.0W | 132.74W | / | 60.5W | 64.0W | 3.5W | 350W | +| 监控结果 | 4233.0W | 4284.0W | 51.0W | / | 164.5W | 229.0W | 64.5W | 450W | ## 其他重要监控结果 
| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | | ---- | --------- | -------- | ------------ | -------------- | -| 监控结果 | 3.381% | 1.401% | 36.3°C | 18.67% | +| 监控结果 | 1.311% | 0.855% | 36.0°C | 18.424% | diff --git a/base/benchmarks/main_memory-bandwidth/metax/C550/case_config.yaml b/base/benchmarks/main_memory-bandwidth/metax/C550/case_config.yaml new file mode 100755 index 000000000..d1e00b7b3 --- /dev/null +++ b/base/benchmarks/main_memory-bandwidth/metax/C550/case_config.yaml @@ -0,0 +1 @@ +DIST_BACKEND: "nccl" diff --git a/base/benchmarks/computation-FP64/metax/env.sh b/base/benchmarks/main_memory-bandwidth/metax/C550/env.sh similarity index 100% rename from base/benchmarks/computation-FP64/metax/env.sh rename to base/benchmarks/main_memory-bandwidth/metax/C550/env.sh diff --git a/base/benchmarks/main_memory-bandwidth/metax/C550/requirements.txt b/base/benchmarks/main_memory-bandwidth/metax/C550/requirements.txt new file mode 100755 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/main_memory-bandwidth/metax/C550/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/main_memory-bandwidth/metax/env.sh b/base/benchmarks/main_memory-bandwidth/metax/env.sh deleted file mode 100755 index 3c1eac6cf..000000000 --- a/base/benchmarks/main_memory-bandwidth/metax/env.sh +++ /dev/null @@ -1 +0,0 @@ -echo "Metax PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/main_memory-capacity/ascend/README.md b/base/benchmarks/main_memory-capacity/ascend/README.md new file mode 100644 index 000000000..12a04d8ee --- /dev/null +++ b/base/benchmarks/main_memory-capacity/ascend/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Ascend + + +* 产品名称:Atlas800T A2 +* 产品型号:Atlas800T A2 +* TDP:350W + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器 + + +* 单服务器内使用卡数:8 +* 服务器型号:Atlas 800T A2训练服务器 +* 操作系统版本:Ubuntu 22.04 LTS +* 操作系统内核:5.15.0-25-generic +* CPU:Kunpeng 920 +* docker版本:此评测样例无需docker环境 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 评测结果 + +## 核心评测结果 + +| 
评测项 | 主存储容量测试值(8卡平均) | 主存储容量标定值(8卡平均) | 测试标定比例(8卡平均) | +| -------- | ------------------------- | ------------------------- | --------------------- | +| 评测结果 | 60.32GiB(64.77GB) | 64GiB | 94.25% | + +## 能耗监控结果 + +此评测样例中无意义 + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | +| -------- | --------------- | ---------------- | +| 监控结果 | 16.400% | 0.100% | diff --git a/base/benchmarks/main_memory-capacity/ascend/case_config.yaml b/base/benchmarks/main_memory-capacity/ascend/case_config.yaml new file mode 100644 index 000000000..662beb6bd --- /dev/null +++ b/base/benchmarks/main_memory-capacity/ascend/case_config.yaml @@ -0,0 +1 @@ +DIST_BACKEND: "gloo" diff --git a/base/benchmarks/main_memory-capacity/ascend/env.sh b/base/benchmarks/main_memory-capacity/ascend/env.sh new file mode 100644 index 000000000..291c49182 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/ascend/env.sh @@ -0,0 +1 @@ +echo "ASCEND PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/main_memory-capacity/ascend/npu_memory_capacity.py b/base/benchmarks/main_memory-capacity/ascend/npu_memory_capacity.py new file mode 100644 index 000000000..8190bac3b --- /dev/null +++ b/base/benchmarks/main_memory-capacity/ascend/npu_memory_capacity.py @@ -0,0 +1,47 @@ +import torch + +import torch_npu +from torch_npu.contrib import transfer_to_npu + + +def test_gpu_memory_capacity(): + # Initial tensor size in MiB + initial_byte_size = 10240 + current_byte_size = initial_byte_size + min_byte_size = 1 + total_allocated = 0 + + tensor_list = [] + + print(f"Init tensor size: {initial_byte_size} MiB...") + + # Loop to reduce tensor size until it reaches the minimum size + while current_byte_size >= min_byte_size: + allocation_failed = False + + # Attempt to allocate memory until failure + while not allocation_failed: + try: + # Allocate tensor of size `current_byte_size` MiB on the GPU + tensor = torch.cuda.FloatTensor(int(current_byte_size * 1024 * 1024 / 4)) + tensor_list.append(tensor) + total_allocated += 
current_byte_size + print(f"Allocated: {total_allocated} MiB") + except RuntimeError as e: + # Handle out-of-memory error + print(f"CUDA OOM at tensor size {current_byte_size} MiB. Allocated: {total_allocated} MiB") + allocation_failed = True + + # Halve the tensor size for the next iteration + current_byte_size /= 2 + print(f"Reduce tensor size to {current_byte_size} MiB") + + # Print the total allocated memory in GiB + print(f"[FlagPerf Result]main_memory-capacity={total_allocated / 1024.0:.2f}GiB") + # Print the total allocated memory in GB (decimal) + print( + f"[FlagPerf Result]main_memory-capacity={total_allocated * 1024.0 * 1024.0 / (1000.0 * 1000.0 * 1000.0):.2f}GB") + + +if __name__ == "__main__": + test_gpu_memory_capacity() diff --git a/base/benchmarks/main_memory-capacity/ascend/requirements.txt b/base/benchmarks/main_memory-capacity/ascend/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/ascend/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/main_memory-capacity/cambricon/MLU/README.md b/base/benchmarks/main_memory-capacity/cambricon/MLU/README.md new file mode 100644 index 000000000..4519d8f88 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/cambricon/MLU/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon + +## 服务器1 + +- 产品名称:/ +- 产品型号:/ +- TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 + +## 服务器1 + +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储容量测试值(8卡平均) | 主存储容量标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | ----------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +此评测样例中无意义 + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | +| ---- | --------- | -------- | +| 监控结果 | / | / | diff --git a/base/benchmarks/main_memory-capacity/cambricon/MLU/case_config.yaml 
b/base/benchmarks/main_memory-capacity/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..04d37c4d6 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/cambricon/MLU/case_config.yaml @@ -0,0 +1,2 @@ +DIST_BACKEND: "cncl" +INITSIZE: 2 \ No newline at end of file diff --git a/base/benchmarks/main_memory-capacity/cambricon/MLU/env.sh b/base/benchmarks/main_memory-capacity/cambricon/MLU/env.sh new file mode 100644 index 000000000..ffc396b9c --- /dev/null +++ b/base/benchmarks/main_memory-capacity/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "Cambricon PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/main_memory-capacity/cambricon/MLU/requirements.txt b/base/benchmarks/main_memory-capacity/cambricon/MLU/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/cambricon/MLU/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/main_memory-capacity/kunlunxin/R300p/case_config.yaml b/base/benchmarks/main_memory-capacity/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..f9c9b3ae9 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/kunlunxin/R300p/case_config.yaml @@ -0,0 +1 @@ +DIST_BACKEND: "xccl" diff --git a/base/benchmarks/main_memory-capacity/kunlunxin/R300p/env.sh b/base/benchmarks/main_memory-capacity/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..c8701d44c --- /dev/null +++ b/base/benchmarks/main_memory-capacity/kunlunxin/R300p/env.sh @@ -0,0 +1,2 @@ +set -x +echo "KUNLUNXIN PLACEHOLDER ENV.SH" diff --git a/base/benchmarks/main_memory-capacity/kunlunxin/R300p/requirements.txt b/base/benchmarks/main_memory-capacity/kunlunxin/R300p/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/kunlunxin/R300p/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/main_memory-capacity/main.py b/base/benchmarks/main_memory-capacity/main.py index 
505b0c8ff..6c61ffa69 100644 --- a/base/benchmarks/main_memory-capacity/main.py +++ b/base/benchmarks/main_memory-capacity/main.py @@ -2,6 +2,12 @@ # # Licensed under the Apache License, Version 2.0 (the "License") +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + import torch import torch.distributed as dist import os @@ -47,7 +53,7 @@ def main(config, case_config, rank, world_size, local_rank): total_allocated += byte_size print(f"Allocated: {total_allocated} MiB") except RuntimeError as e: - if "CUDA out of memory" in str(e): + if "out of memory" in str(e): print(f"CUDA OOM at tensor size {byte_size} MiB. Allocated:{total_allocated} MiB") byte_size //= 2 if byte_size < min_byte_size: diff --git a/base/benchmarks/main_memory-capacity/metax/README.md b/base/benchmarks/main_memory-capacity/metax/C550/README.md similarity index 73% rename from base/benchmarks/main_memory-capacity/metax/README.md rename to base/benchmarks/main_memory-capacity/metax/C550/README.md index b739de5f4..f9d3ff50f 100755 --- a/base/benchmarks/main_memory-capacity/metax/README.md +++ b/base/benchmarks/main_memory-capacity/metax/C550/README.md @@ -2,23 +2,21 @@ * 厂商:Metax -## 服务器1 -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W # 所用服务器配置 * 服务器数量:1 -## 服务器1 * 单服务器内使用卡数:8 -* 服务器型号:同泰怡 G658V3 +* 服务器型号:OAM C550-1500 * 操作系统版本:Ubuntu 20.04.6 LTS * 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core +* CPU:Inter(R) Xeon(R) Plattinum 8480+ * docker版本:24.0.7 * 内存:2TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 @@ -29,7 +27,7 @@ | 评测项 | 主存储容量测试值(8卡平均) | 主存储容量标定值(8卡平均) | 测试标定比例(8卡平均) | | ---- | ----------------- | -------------- | ------------ | -| 评测结果 | 60.71 | 64GiB | 94.86% | +| 评测结果 | 65.25 | 64GiB | 101.95% | ## 能耗监控结果 @@ -39,4 +37,4 @@ | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | | ---- | --------- | -------- | -| 监控结果 | 7.09% | 2.263% | +| 监控结果 | 3.484% | 1.328% | diff --git 
a/base/benchmarks/main_memory-capacity/metax/case_config.yaml b/base/benchmarks/main_memory-capacity/metax/C550/case_config.yaml similarity index 100% rename from base/benchmarks/main_memory-capacity/metax/case_config.yaml rename to base/benchmarks/main_memory-capacity/metax/C550/case_config.yaml diff --git a/base/benchmarks/computation-TF32/metax/env.sh b/base/benchmarks/main_memory-capacity/metax/C550/env.sh similarity index 100% rename from base/benchmarks/computation-TF32/metax/env.sh rename to base/benchmarks/main_memory-capacity/metax/C550/env.sh diff --git a/base/benchmarks/main_memory-capacity/metax/C550/requirements.txt b/base/benchmarks/main_memory-capacity/metax/C550/requirements.txt new file mode 100755 index 000000000..330e27963 --- /dev/null +++ b/base/benchmarks/main_memory-capacity/metax/C550/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/base/benchmarks/main_memory-capacity/metax/env.sh b/base/benchmarks/main_memory-capacity/metax/env.sh deleted file mode 100755 index 3c1eac6cf..000000000 --- a/base/benchmarks/main_memory-capacity/metax/env.sh +++ /dev/null @@ -1 +0,0 @@ -echo "Metax PLACEHOLDER ENV.SH" diff --git a/base/configs/host.yaml b/base/configs/host.yaml old mode 100644 new mode 100755 index 32fbbcff9..229b8dea5 --- a/base/configs/host.yaml +++ b/base/configs/host.yaml @@ -16,6 +16,7 @@ MASTER_PORT: "29501" SHM_SIZE: "32G" ACCE_CONTAINER_OPT: " --gpus all" # for nvidia, using " -- gpus all" +# for kunlunxin, using "--device=/dev/xpu0 --device=/dev/xpu1 --device=/dev/xpu2 --device=/dev/xpu3 --device=/dev/xpu4 --device=/dev/xpu5 --device=/dev/xpu6 --device=/dev/xpu7 --device=/dev/xpuctrl" # for xxx, using PIP_SOURCE: "https://mirror.baidu.com/pypi/simple" CLEAR_CACHES: True @@ -53,3 +54,19 @@ CASES: # nvidia "interconnect-P2P_intraserver:A100": "pyorch_2.3" # nvidia "interconnect-P2P_interserver:A100": "pytorch_ssh" # nvidia "interconnect-MPI_interserver:A100": "pytorch_ssh" + +# metax "computation-FP16:C550": "pytorch_2.0" +# metax 
"interconnect-h2d:C550": "pytorch_2.0" +# metax "interconnect-MPI_interserver:C550": "pytorch_2.0" +# metax "interconnect-MPI_intraserver:C550": "pytorch_2.0" +# metax "interconnect-P2P_interserver:C550": "pytorch_2.0" +# metax "interconnect-P2P_intraserver:C550": "pytorch_2.0" +# metax "computation-FP32:C550": "pytorch_2.0" +# metax "computation-TF32:C550": "pytorch_2.0" +# metax "computation-BF16:C550": "pytorch_2.0" +# metax "computation-INT8:C550": "pytorch_2.0" +# metax "main_memory-bandwidth:C550": "pytorch_2.0" +# metax "main_memory-capacity:C550": "pytorch_2.0" +# metax "computation-FP64:C550": "pytorch_2.0" + + # kunlunxin "main_memory-capacity:R300p": "xpytorch029" diff --git a/base/run.py b/base/run.py old mode 100644 new mode 100755 index a9f957b84..d61058914 --- a/base/run.py +++ b/base/run.py @@ -143,8 +143,7 @@ def clear_caches_cluster(clear, nnodes): def start_monitors_in_cluster(dp_path, case_log_dir, nnodes, config): '''Start sytem and vendor's monitors.''' - start_mon_cmd = "cd " + dp_path + " && " + sys.executable \ - + " ../utils/sys_monitor.py -o restart -l " + start_mon_cmd = "cd " + dp_path + " && " + sys.executable + " ../utils/sys_monitor.py -v " + config.VENDOR + " -o restart -l " timeout = 60 RUN_LOGGER.debug("Run cmd in the cluster to start system monitors: " + start_mon_cmd) @@ -201,8 +200,13 @@ def start_tasks_in_cluster(dp_path, container_name, config, base_args, nnodes = len(config.HOSTS) framework = config.CASES[case] - env_dir = os.path.join( - config.FLAGPERF_PATH, "benchmarks", case, config.VENDOR) + case_absname = case.split(":")[0] if ":" in case else case + chipname = case.split(":")[1] if ":" in case else None + if chipname is not None: + env_dir = os.path.join(config.FLAGPERF_PATH, "benchmarks", case_absname, config.VENDOR, chipname) + else: + env_dir = os.path.join(config.FLAGPERF_PATH, "benchmarks", case_absname, config.VENDOR) + env_shell = os.path.join(env_dir, "env.sh") req_file = os.path.join(env_dir, 
"requirements.txt") diff --git a/base/toolkits/computation-BF16/cambricon/MLU/README.md b/base/toolkits/computation-BF16/cambricon/MLU/README.md new file mode 100644 index 000000000..bd0cabe99 --- /dev/null +++ b/base/toolkits/computation-BF16/cambricon/MLU/README.md @@ -0,0 +1,41 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | BF16算力测试值 | BF16算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + diff --git a/base/toolkits/computation-BF16/cambricon/MLU/cnvs.example.yml b/base/toolkits/computation-BF16/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..7469216b0 --- /dev/null +++ b/base/toolkits/computation-BF16/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,13 @@ +custom: +- custom: + matmul_performance: + matrix_dimension_m: 8192 + matrix_dimension_k: 8192 + matrix_dimension_n: 8192 + transpose_a: false + transpose_b: true + input_data_type: "bfloat16" + output_data_type: "bfloat16" + input_data_random: false + correct_check: false + iterations: 50000 diff --git a/base/toolkits/computation-BF16/cambricon/MLU/main.sh b/base/toolkits/computation-BF16/cambricon/MLU/main.sh new file mode 100644 index 000000000..b63a1585d --- /dev/null +++ b/base/toolkits/computation-BF16/cambricon/MLU/main.sh @@ -0,0 +1,9 @@ +#!/bin/bash +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log 
+cnvs -r matmul_performance -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +value=$(grep -o 'matmul performance(GOPS): [0-9.]\+' ${LOG_PATH} ) +number=$(echo $value | grep -o '[0-9.]\+') +result=$(python3 -c "print(float($number) / 1000)") +echo "[FlagPerf Result] computation-BF16=$result TFLOPS" +rm -rf cnvs_stats ${LOG_PATH} #删除缓存文件 diff --git a/base/toolkits/computation-FP16/cambricon/MLU/README.md b/base/toolkits/computation-FP16/cambricon/MLU/README.md new file mode 100644 index 000000000..40aa5371b --- /dev/null +++ b/base/toolkits/computation-FP16/cambricon/MLU/README.md @@ -0,0 +1,40 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | FP16算力测试值 | BF16算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 \ No newline at end of file diff --git a/base/toolkits/computation-FP16/cambricon/MLU/cnvs.example.yml b/base/toolkits/computation-FP16/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..21a04cb60 --- /dev/null +++ b/base/toolkits/computation-FP16/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,13 @@ +custom: +- custom: + matmul_performance: + matrix_dimension_m: 8192 + matrix_dimension_k: 8192 + matrix_dimension_n: 8192 + transpose_a: false + transpose_b: true + input_data_type: "half" + output_data_type: "half" + input_data_random: false + correct_check: false + iterations: 50000 diff --git 
a/base/toolkits/computation-FP16/cambricon/MLU/main.sh b/base/toolkits/computation-FP16/cambricon/MLU/main.sh new file mode 100644 index 000000000..a2e861e91 --- /dev/null +++ b/base/toolkits/computation-FP16/cambricon/MLU/main.sh @@ -0,0 +1,9 @@ +#!/bin/bash +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cnvs -r matmul_performance -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +value=$(grep -o 'matmul performance(GOPS): [0-9.]\+' ${LOG_PATH} ) +number=$(echo $value | grep -o '[0-9.]\+') +result=$(python3 -c "print(float($number) / 1000)") +echo "[FlagPerf Result] computation-FP16=$result TFLOPS" +rm -rf cnvs_stats ${LOG_PATH} #删除缓存文件 diff --git a/base/toolkits/computation-FP32/cambricon/MLU/README.md b/base/toolkits/computation-FP32/cambricon/MLU/README.md new file mode 100644 index 000000000..a59afabb9 --- /dev/null +++ b/base/toolkits/computation-FP32/cambricon/MLU/README.md @@ -0,0 +1,40 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | FP32算力测试值 | BF16算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 \ No newline at end of file diff --git a/base/toolkits/computation-FP32/cambricon/MLU/cnvs.example.yml b/base/toolkits/computation-FP32/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..90c69aa4a --- /dev/null +++ 
b/base/toolkits/computation-FP32/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,13 @@ +custom: +- custom: + matmul_performance: + matrix_dimension_m: 8192 + matrix_dimension_k: 8192 + matrix_dimension_n: 8192 + transpose_a: false + transpose_b: true + input_data_type: "float" + output_data_type: "float" + input_data_random: false + correct_check: false + iterations: 15000 diff --git a/base/toolkits/computation-FP32/cambricon/MLU/main.sh b/base/toolkits/computation-FP32/cambricon/MLU/main.sh new file mode 100644 index 000000000..4cfc42197 --- /dev/null +++ b/base/toolkits/computation-FP32/cambricon/MLU/main.sh @@ -0,0 +1,9 @@ +#!/bin/bash +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cnvs -r matmul_performance -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +value=$(grep -o 'matmul performance(GOPS): [0-9.]\+' ${LOG_PATH} ) +number=$(echo $value | grep -o '[0-9.]\+') +result=$(python3 -c "print(float($number) / 1000)") +echo "[FlagPerf Result] computation-FP32=$result TFLOPS" +rm -rf cnvs_stats ${LOG_PATH} #删除缓存文件 diff --git a/base/toolkits/computation-INT8/cambricon/MLU/README.md b/base/toolkits/computation-INT8/cambricon/MLU/README.md new file mode 100644 index 000000000..41ed84307 --- /dev/null +++ b/base/toolkits/computation-INT8/cambricon/MLU/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | INT8算力测试值 | BF16算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 
单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 \ No newline at end of file diff --git a/base/toolkits/computation-INT8/cambricon/MLU/cnvs.example.yml b/base/toolkits/computation-INT8/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..2b4638d91 --- /dev/null +++ b/base/toolkits/computation-INT8/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,13 @@ +custom: +- custom: + matmul_performance: + matrix_dimension_m: 8192 + matrix_dimension_k: 8192 + matrix_dimension_n: 8192 + transpose_a: false + transpose_b: true + input_data_type: "int8" + output_data_type: "half" + input_data_random: false + correct_check: false + iterations: 200000 diff --git a/base/toolkits/computation-INT8/cambricon/MLU/main.sh b/base/toolkits/computation-INT8/cambricon/MLU/main.sh new file mode 100644 index 000000000..7f74fd698 --- /dev/null +++ b/base/toolkits/computation-INT8/cambricon/MLU/main.sh @@ -0,0 +1,9 @@ +#!/bin/bash +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cnvs -r matmul_performance -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +value=$(grep -o 'matmul performance(GOPS): [0-9.]\+' ${LOG_PATH} ) +number=$(echo $value | grep -o '[0-9.]\+') +result=$(python3 -c "print(float($number) / 1000)") +echo "[FlagPerf Result] computation-INT8=$result TOPS" +rm -rf cnvs_stats ${LOG_PATH} #删除缓存文件 diff --git a/base/toolkits/computation-INT8/metax/README.md b/base/toolkits/computation-INT8/metax/C550/README.md similarity index 79% rename from base/toolkits/computation-INT8/metax/README.md rename to base/toolkits/computation-INT8/metax/C550/README.md index 5d359c816..23b2d3f88 100755 --- a/base/toolkits/computation-INT8/metax/README.md +++ b/base/toolkits/computation-INT8/metax/C550/README.md @@ -2,23 +2,21 @@ * 厂商:Metax -## 服务器1 -- 产品名称:C500 -- 产品型号:曦云®C500 64G -- TDP:350W +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W # 所用服务器配置 * 服务器数量:1 -## 服务器1 -* 单服务器内使用卡数:8 -* 服务器型号:同泰怡 
G658V3 +* 单服务器内使用卡数:2 +* 服务器型号:OAM C550-1500 * 操作系统版本:Ubuntu 20.04.6 LTS * 操作系统内核:linux5.15.0-58-generic -* CPU:Montage Jintide(R) C8458P-176core +* CPU:Inter(R) Xeon(R) Plattinum 8480+ * docker版本:24.0.7 * 内存:2TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 @@ -29,13 +27,13 @@ | 评测项 | BF16算力测试值(8卡平均) | BF16算力标定值(8卡平均) | 测试标定比例(8卡平均) | | ---- | ---------------- | ---------------- | ------------- | -| 评测结果 | | | 80.83% | +| 评测结果 | | | 79.3% | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡平均) | 单卡TDP | | ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- | -| 监控结果 | 1820.77W | 1950.0W | 54.52W | / | 56.5W | 57.0W | 0.5W | 350W | +| 监控结果 | 1820.77W | 1950.0W | 54.52W | / | 56.5W | 57.0W | 0.5W | 450W | ## 其他重要监控结果 diff --git a/base/toolkits/computation-INT8/metax/gemm.cu b/base/toolkits/computation-INT8/metax/C550/gemm.cu similarity index 100% rename from base/toolkits/computation-INT8/metax/gemm.cu rename to base/toolkits/computation-INT8/metax/C550/gemm.cu diff --git a/base/toolkits/computation-INT8/metax/gemm_maca.cu b/base/toolkits/computation-INT8/metax/C550/gemm_maca.cu similarity index 100% rename from base/toolkits/computation-INT8/metax/gemm_maca.cu rename to base/toolkits/computation-INT8/metax/C550/gemm_maca.cu diff --git a/base/toolkits/computation-INT8/metax/main.sh b/base/toolkits/computation-INT8/metax/C550/main.sh similarity index 100% rename from base/toolkits/computation-INT8/metax/main.sh rename to base/toolkits/computation-INT8/metax/C550/main.sh diff --git a/base/toolkits/computation-TF32/cambricon/MLU/README.md b/base/toolkits/computation-TF32/cambricon/MLU/README.md new file mode 100644 index 000000000..ebef1da53 --- /dev/null +++ b/base/toolkits/computation-TF32/cambricon/MLU/README.md @@ -0,0 +1,40 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 
操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | TF32算力测试值 | BF16算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 \ No newline at end of file diff --git a/base/toolkits/computation-TF32/cambricon/MLU/cnvs.example.yml b/base/toolkits/computation-TF32/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..0d1e81dad --- /dev/null +++ b/base/toolkits/computation-TF32/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,13 @@ +custom: +- custom: + matmul_performance: + matrix_dimension_m: 8192 + matrix_dimension_k: 8192 + matrix_dimension_n: 8192 + transpose_a: false + transpose_b: true + input_data_type: "tfloat" + output_data_type: "tfloat" + input_data_random: false + correct_check: false + iterations: 30000 diff --git a/base/toolkits/computation-TF32/cambricon/MLU/main.sh b/base/toolkits/computation-TF32/cambricon/MLU/main.sh new file mode 100644 index 000000000..1a907750a --- /dev/null +++ b/base/toolkits/computation-TF32/cambricon/MLU/main.sh @@ -0,0 +1,9 @@ +#!/bin/bash +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cnvs -r matmul_performance -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +value=$(grep -o 'matmul performance(GOPS): [0-9.]\+' ${LOG_PATH} ) +number=$(echo $value | grep -o '[0-9.]\+') +result=$(python3 -c "print(float($number) / 1000)") +echo "[FlagPerf Result] computation-TF32=$result TFLOPS" +rm -rf cnvs_stats ${LOG_PATH} #删除缓存文件 diff --git 
a/base/toolkits/interconnect-MPI_interserver/cambricon/MLU/README.md b/base/toolkits/interconnect-MPI_interserver/cambricon/MLU/README.md new file mode 100644 index 000000000..a2324d0c8 --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/cambricon/MLU/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器间多卡的MPI互联带宽测试值(16卡平均) | 服务器间多卡的MPI互联带宽标定值(16卡平均) | 测试标定比例(16卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + + +# 厂商测试工具原理说明 \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_interserver/cambricon/MLU/main.sh b/base/toolkits/interconnect-MPI_interserver/cambricon/MLU/main.sh new file mode 100644 index 000000000..4f76405e0 --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/cambricon/MLU/main.sh @@ -0,0 +1,57 @@ +# step-1 获取ip +file="../../../../configs/host.yaml" +hosts=$(grep "HOSTS" "$file" | sed -n 's/.*\[\(.*\)\].*/\1/p') +IFS=',' read -ra ADDR <<< "$hosts" +ip1=$(echo "${ADDR[0]}" | sed 's/^[ \t]*//;s/[ \t]*$//' | sed 's/"//g') +ip2=$(echo "${ADDR[1]}" | sed 's/^[ \t]*//;s/[ \t]*$//' | sed 's/"//g') + +# step-1 配置免密 +echo 'root:123456' | sudo chpasswd +rm -rf ~/.ssh/* && ssh-keygen -t rsa -N '' -f /root/.ssh/id_rsa -q +sed -i '/StrictHostKeyChecking/c StrictHostKeyChecking no' /etc/ssh/ssh_config +sed -i 's/#Port 22/Port 1234/g' /etc/ssh/sshd_config +sed -i 
's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config +sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config +/etc/init.d/ssh restart +sleep 10 +sshpass -p "123456" ssh-copy-id -i ~/.ssh/id_rsa.pub -p 1234 root@${ip1} +sshpass -p "123456" ssh-copy-id -i ~/.ssh/id_rsa.pub -p 1234 root@${ip2} + +# step-3 正式测试 +finished_str="All Result Check: PASSED" +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cur_ip=`hostname -i | awk '{print $1}'` +tcp_if_include=`echo ${ip1} | awk -F'.' '{print $1"."$2"."$3}'` + +export CNCL_MLULINK_OVER_ROCE_DISABLE=1 +export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0 +export CNCL_MLU_DIRECT_LEVEL=1 + +if [ "$cur_ip" == "$ip1" ]; then + /usr/local/openmpi/bin/mpirun --allow-run-as-root -n 16 --host ${ip1}:8,${ip2}:8 \ + -x CNCL_MLULINK_OVER_ROCE_DISABLE -x CNCL_MLULINK_CROSS_HOSTS_ENABLE -x CNCL_MLU_DIRECT_LEVEL \ + -x PATH -x LD_LIBRARY_PATH -mca btl ^openib -bind-to none -map-by slot -mca plm_rsh_args \ + "-p 1234" -mca btl_tcp_if_include ${tcp_if_include}.0/24 \ + /usr/local/neuware/bin/allreduce --warmup_loop 20 --thread 1 --loop 2000 --mincount 1 --maxcount 512M --multifactor 2 --async 1 --block 0 \ + 2>&1 | tee ${LOG_PATH} + data=$(tail -n 2 ${LOG_PATH} | awk '{print $11}') + while [ ! -f ${ip2}_run_log ] || ! grep -q "$finished_str" ${ip2}_run_log ; do + # "等待ip2测试完成..." + sleep 1 # 等待1秒再次检查 + done + echo "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=$data GB/s" + rm -rf ${ip1}_run_log ${ip2}_run_log +else + while [ ! -f ${ip1}_run_log ] || ! grep -q "$finished_str" ${ip1}_run_log ; do + # "等待ip1测试完成..." 
+ sleep 1 # 等待1秒再次检查 + done + /usr/local/openmpi/bin/mpirun --allow-run-as-root -n 16 --host ${ip1}:8,${ip2}:8 \ + -x CNCL_MLULINK_OVER_ROCE_DISABLE -x CNCL_MLULINK_CROSS_HOSTS_ENABLE -x CNCL_MLU_DIRECT_LEVEL \ + -x PATH -x LD_LIBRARY_PATH -mca btl ^openib -bind-to none -map-by slot -mca plm_rsh_args \ + "-p 1234" -mca btl_tcp_if_include ${tcp_if_include}.0/24 \ + /usr/local/neuware/bin/allreduce --warmup_loop 20 --thread 1 --loop 2000 --mincount 1 --maxcount 512M --multifactor 2 --async 1 --block 0 \ + 2>&1 | tee ${LOG_PATH} + data=$(tail -n 2 ${LOG_PATH} | awk '{print $11}') + echo "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=$data GB/s" +fi diff --git a/base/toolkits/interconnect-MPI_interserver/nvidia/A100/README.md b/base/toolkits/interconnect-MPI_interserver/nvidia/A100/README.md index 2d10971b3..20d377315 100644 --- a/base/toolkits/interconnect-MPI_interserver/nvidia/A100/README.md +++ b/base/toolkits/interconnect-MPI_interserver/nvidia/A100/README.md @@ -19,6 +19,7 @@ * CPU:AMD EPYC7742-64core * docker版本:20.10.16 * 内存:1TiB +* 机内总线协议:Speed 16GT/s, Width x16(PCIE4) * 服务器间多卡的MPI互联带宽采用多种通信方式组合,无标定互联带宽 # 指标选型 @@ -35,10 +36,16 @@ The second metric, busbw, is chosen for the following reasons: ## 核心评测结果 -| 评测项 | 服务器间多卡的MPI互联带宽测试值(8卡平均) | 服务器间多卡的MPI互联带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| 评测项 | 服务器间多卡的MPI互联算法带宽测试值(8卡平均) | 服务器间多卡的MPI互联算法带宽标定值(8卡平均) | 测试标定比例(8卡平均) | | ---- | -------------- | -------------- | ------------ | -| 评测结果 | 48.31GB/s | / | / | +| 评测结果 | 25.77GB/s | / | / | +| 评测项 | 服务器间多卡的MPI互联等效带宽测试值(8卡平均) | 服务器间多卡的MPI互联等效带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | 96.62GB/s | / | / | +* 等效带宽为双向带宽 + +* 算法带宽、等效带宽计算参考:https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md ## 能耗监控结果 diff --git a/base/toolkits/interconnect-MPI_interserver/nvidia/A100/bandwidth.cu b/base/toolkits/interconnect-MPI_interserver/nvidia/A100/bandwidth.cu index 69dd88995..c0376edb1 100644 --- 
a/base/toolkits/interconnect-MPI_interserver/nvidia/A100/bandwidth.cu +++ b/base/toolkits/interconnect-MPI_interserver/nvidia/A100/bandwidth.cu @@ -4,6 +4,7 @@ #include #include #include +#include #define GB (1024ULL * 1024ULL * 1024ULL) #define SIZE (4ULL * GB) @@ -98,9 +99,20 @@ int main(int argc, char *argv[]) { */ double algbw = SIZE * ITERATIONS / (elapsed_time / 1000.0); double bandwidth = algbw * (2.0 * (total_gpus - 1) / total_gpus); + bandwidth = bandwidth + bandwidth; if (rank == 0) { - printf("[FlagPerf Result]interconnect-MPI_interserver-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); - printf("[FlagPerf Result]interconnect-MPI_interserver-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw=" + << std::fixed << std::setprecision(2) << algbw / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw=" + << std::fixed << std::setprecision(2) << algbw / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; } checkCudaError(cudaFree(d_src), "cudaFree"); checkCudaError(cudaFree(d_dst), "cudaFree"); diff --git a/base/toolkits/interconnect-MPI_interserver/nvidia/A100/main.sh b/base/toolkits/interconnect-MPI_interserver/nvidia/A100/main.sh index 1dabcabb5..7d871a092 100644 --- a/base/toolkits/interconnect-MPI_interserver/nvidia/A100/main.sh +++ b/base/toolkits/interconnect-MPI_interserver/nvidia/A100/main.sh @@ -2,11 +2,12 @@ export NCCL_DEBUG=WARN export OPAL_PREFIX=/opt/hpcx/ompi export 
PATH=/usr/local/nvm/versions/node/v16.20.2/bin:/usr/local/lib/python3.10/dist-packages/torch_tensorrt/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/tensorrt/bin export LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib:/usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 +HOSTS=$(yq '.HOSTS | map(. + ":8") | join(",")' ../../../../configs/host.yaml) nvcc -c -o bandwidth.o bandwidth.cu -I/usr/local/cuda/include -I/usr/local/nccl/include -I/usr/local/mpi/include mpic++ -o bdtest bandwidth.o -L/usr/local/cuda/lib64 -L/usr/local/nccl/lib -L/usr/local/mpi/lib -lcudart -lnccl -lcuda -lmpi echo "NODERANK: $NODERANK" if [ "$NODERANK" -eq 0 ]; then echo "NODERANK is 0, executing the final command..." sleep 10 - mpirun --allow-run-as-root --host 10.1.2.155:8,10.1.2.158:8 -np 16 -x NCCL_DEBUG=WARN -x NCCL_IB_DISABLE=0 -x NCCL_IB_HCA=mlx5_2,mlx5_5 -x CUDA_DEVICE_MAX_CONNECTIONS=1 ./bdtest + mpirun --allow-run-as-root --host $HOSTS -x NCCL_DEBUG=WARN -x NCCL_IB_DISABLE=0 -x NCCL_IB_HCA=mlx5_2,mlx5_5 -x CUDA_DEVICE_MAX_CONNECTIONS=1 ./bdtest fi \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/cambricon/MLU/README.md b/base/toolkits/interconnect-MPI_intraserver/cambricon/MLU/README.md new file mode 100644 index 000000000..095fbe4d2 --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/cambricon/MLU/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 单机多卡的MLULink互联带宽测试值 | 单机多卡的MPI互联带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | | | | + 
+**注意:上述结果为示例结果值的2倍,对齐Nvidia测试方法。** + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/cambricon/MLU/main.sh b/base/toolkits/interconnect-MPI_intraserver/cambricon/MLU/main.sh new file mode 100644 index 000000000..59e18b329 --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/cambricon/MLU/main.sh @@ -0,0 +1,14 @@ +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +/usr/local/neuware/bin/allreduce \ + --warmup_loop 20 \ + --thread 8 \ + --loop 2000 \ + --mincount 1 \ + --maxcount 512M \ + --multifactor 2 \ + --async 1 \ + --block 0 2>&1 | tee ${LOG_PATH} +data=$(tail -n 2 ${LOG_PATH} | awk '{print $11 }') +result=$(python3 -c "print(float($data) * 2)") +echo "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=$result GB/s" +rm -rf ${LOG_PATH} #删除缓存文件 \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/README.md b/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/README.md index 71852d84d..30cff9b6f 100644 --- a/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/README.md +++ b/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/README.md @@ -15,15 +15,21 @@ * CPU:AMD EPYC7742-64core * docker版本:20.10.16 * 内存:1TiB +* 机内总线协议:Speed 16GT/s, Width x16(PCIE4) # 评测结果 ## 核心评测结果 -| 评测项 | 单机多卡的MPI互联带宽测试值 | 单机多卡的MPI互联带宽标定值 | 测试标定比例 | +| 评测项 | 单机多卡的MPI互联算法带宽测试值 | 单机多卡的MPI互联算法带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | 132.76GB/s | / | / | + +| 评测项 | 单机多卡的MPI互联等效带宽测试值 | 单机多卡的MPI互联等效带宽标定值 | 测试标定比例 | | ---- | ----------- | -------- | ------ | | 评测结果 | 464.65GB/s | 
600GB/s | 77.4% | +* 算法带宽、等效带宽计算参考:https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md ## 能耗监控结果 diff --git a/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/bandwidth.cu b/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/bandwidth.cu index 3403a3de0..a6ba9d856 100644 --- a/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/bandwidth.cu +++ b/base/toolkits/interconnect-MPI_intraserver/nvidia/A100/bandwidth.cu @@ -6,7 +6,7 @@ #include #include #include - +#include #define GB (1024ULL * 1024ULL * 1024ULL) #define SIZE (4ULL * GB) @@ -75,7 +75,6 @@ int main() { checkCudaError(cudaEventRecord(end), "cudaEventRecord"); checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize"); checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime"); - /* algbw = S/t Considering that each rank has a bandwidth to the outside world of B, the time to perform an allReduce operation of S elements is at best : @@ -92,12 +91,22 @@ int main() { we have multiplied the bandwidth result here by two. 
*/ double algbw = SIZE * ITERATIONS / (elapsed_time / 1000.0); - double bandwidth = algbw * (2.0 * (num_gpus-1) / num_gpus); - bandwidth = bandwidth * 2.0; + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-algbw=" + << std::fixed << std::setprecision(2) << algbw / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; - printf("[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); - printf("[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-algbw=" + << std::fixed << std::setprecision(2) << algbw / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; + double bandwidth = algbw * (2.0 * (num_gpus-1) / num_gpus); + bandwidth = bandwidth + bandwidth; + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; for (int i = 0; i < num_gpus; ++i) { checkCudaError(cudaFree(d_src[i]), "cudaFree"); checkCudaError(cudaFree(d_dst[i]), "cudaFree"); diff --git a/base/toolkits/interconnect-P2P_interserver/cambricon/MLU/README.md b/base/toolkits/interconnect-P2P_interserver/cambricon/MLU/README.md new file mode 100644 index 000000000..abd254f37 --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/cambricon/MLU/README.md @@ -0,0 +1,40 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 跨服务器P2P互联带宽测试值(2卡平均) | 跨服务器P2P互联带宽标定值(2卡平均) | 测试标定比例(2卡平均) | +| ---- | 
-------------- | -------------- | ------------ | +| 评测结果 | / | 无 | 无 | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 diff --git a/base/toolkits/interconnect-P2P_interserver/cambricon/MLU/main.sh b/base/toolkits/interconnect-P2P_interserver/cambricon/MLU/main.sh new file mode 100644 index 000000000..a49273b54 --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/cambricon/MLU/main.sh @@ -0,0 +1,45 @@ +# step-1 获取ip +file="../../../../configs/host.yaml" +hosts=$(grep "HOSTS" "$file" | sed -n 's/.*\[\(.*\)\].*/\1/p') +IFS=',' read -ra ADDR <<< "$hosts" +ip1=$(echo "${ADDR[0]}" | sed 's/^[ \t]*//;s/[ \t]*$//' | sed 's/"//g') +ip2=$(echo "${ADDR[1]}" | sed 's/^[ \t]*//;s/[ \t]*$//' | sed 's/"//g') + +# step-1 配置免密 +echo 'root:123456' | sudo chpasswd +rm -rf ~/.ssh/* && ssh-keygen -t rsa -N '' -f /root/.ssh/id_rsa -q +sed -i '/StrictHostKeyChecking/c StrictHostKeyChecking no' /etc/ssh/ssh_config +sed -i 's/#Port 22/Port 1234/g' /etc/ssh/sshd_config +sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config +sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config +/etc/init.d/ssh restart +sleep 10 +sshpass -p "123456" ssh-copy-id -i ~/.ssh/id_rsa.pub -p 1234 root@${ip1} +sshpass -p "123456" ssh-copy-id -i ~/.ssh/id_rsa.pub -p 1234 root@${ip2} + +# step-3 正式测试 +cur_ip=`hostname -i | awk '{print $1}'` +if [ "$cur_ip" == "$ip1" ]; then + export MLU_VISIBLE_DEVICES=0 +else + export MLU_VISIBLE_DEVICES=1 +fi +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +tcp_if_include=`echo ${ip1} | awk -F'.' 
'{print $1"."$2"."$3}'` + +export CNCL_MLULINK_OVER_ROCE_DISABLE=1 +export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0 +export CNCL_MLU_DIRECT_LEVEL=1 + +/usr/local/openmpi/bin/mpirun \ + --allow-run-as-root -n 2 --host ${ip1}:1,${ip2}:1 \ + -x PATH -x LD_LIBRARY_PATH -x MLU_VISIBLE_DEVICES \ + -x CNCL_MLULINK_OVER_ROCE_DISABLE -x CNCL_MLULINK_CROSS_HOSTS_ENABLE -x CNCL_MLU_DIRECT_LEVEL \ + -mca btl ^openib -bind-to none -map-by slot -mca plm_rsh_args \ + "-p 1234" -mca btl_tcp_if_include ${tcp_if_include}.0/24 \ + /usr/local/neuware/bin/sendrecv --warmup_loop 21 --thread 1 --loop 250 --mincount 1 --maxcount 512M --multifactor 2 --async 1 --block 0 \ + 2>&1 | tee ${LOG_PATH} +data=$(tail -n 2 ${LOG_PATH} | awk '{print $10}') +sleep 30 +echo "[FlagPerf Result]interconnect-P2P_interserver-bandwidth=$data GB/s" +rm -rf ${LOG_PATH} diff --git a/base/toolkits/interconnect-P2P_interserver/nvidia/A100/README.md b/base/toolkits/interconnect-P2P_interserver/nvidia/A100/README.md index 1f852bcb2..5f76e150c 100644 --- a/base/toolkits/interconnect-P2P_interserver/nvidia/A100/README.md +++ b/base/toolkits/interconnect-P2P_interserver/nvidia/A100/README.md @@ -19,15 +19,16 @@ * CPU:AMD EPYC7742-64core * docker版本:20.10.16 * 内存:1TiB -* RDMA网卡:25GB/s +* 机内总线协议:Speed 16GT/s, Width x16(PCIE4) +* RDMA网卡:50GB/s(双向) # 评测结果 ## 核心评测结果 -| 评测项 | 跨服务器P2P互联带宽测试值(2卡平均) | 跨服务器P2P互联带宽标定值(2卡平均) | 测试标定比例(2卡平均) | +| 评测项 | 跨服务器P2P互联带宽测试值(2卡平均,双向) | 跨服务器P2P互联带宽标定值(2卡平均,双向) | 测试标定比例(2卡平均) | | ---- | -------------- | -------------- | ------------ | -| 评测结果 | 19.57GB/s | 25.00GB/s | 78.28% | +| 评测结果 | 39.14GB/s | 50.00GB/s | 78.28% | ## 能耗监控结果 diff --git a/base/toolkits/interconnect-P2P_interserver/nvidia/A100/bandwidth.cu b/base/toolkits/interconnect-P2P_interserver/nvidia/A100/bandwidth.cu index 4297a1083..10701294b 100644 --- a/base/toolkits/interconnect-P2P_interserver/nvidia/A100/bandwidth.cu +++ b/base/toolkits/interconnect-P2P_interserver/nvidia/A100/bandwidth.cu @@ -5,6 +5,8 @@ #include #include #include 
+#include +#include #define SIZE (1024ULL * 1024ULL * 1024ULL * sizeof(float)) #define WARMUP_ITERATIONS 1000 @@ -92,10 +94,14 @@ int main(int argc, char **argv) { checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize"); checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime"); - double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0); - printf("[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); - printf("[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0) + SIZE * ITERATIONS / (elapsed_time / 1000.0); + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; checkCudaError(cudaEventDestroy(start), "cudaEventDestroy"); checkCudaError(cudaEventDestroy(end), "cudaEventDestroy"); checkCudaError(cudaFree(d_tensor), "cudaFree"); diff --git a/base/toolkits/interconnect-P2P_interserver/nvidia/A100/main.sh b/base/toolkits/interconnect-P2P_interserver/nvidia/A100/main.sh index 9c20b8285..8baae226b 100644 --- a/base/toolkits/interconnect-P2P_interserver/nvidia/A100/main.sh +++ b/base/toolkits/interconnect-P2P_interserver/nvidia/A100/main.sh @@ -6,8 +6,9 @@ export PATH=/usr/local/nvm/versions/node/v16.20.2/bin:/usr/local/lib/python3.10/ export LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib:/usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 nvcc -c -o bandwidth.o bandwidth.cu -I/usr/local/cuda/include -I/usr/local/nccl/include -I/usr/local/mpi/include 
mpic++ -o bdtest bandwidth.o -L/usr/local/cuda/lib64 -L/usr/local/nccl/lib -L/usr/local/mpi/lib -lcudart -lnccl -lcuda -lmpi +HOSTS=$(yq '.HOSTS | join(",")' ../../../../configs/host.yaml) echo "NODERANK: $NODERANK" if [ "$NODERANK" -eq 0 ]; then echo "NODERANK is 0, executing the final command..." - mpirun --allow-run-as-root --host 10.1.2.155,10.1.2.158 -np 2 -x NCCL_DEBUG=WARN -x NCCL_IB_DISABLE=0 -x NCCL_IB_HCA=mlx5_2,mlx5_5 -x CUDA_DEVICE_MAX_CONNECTIONS=1 ./bdtest + mpirun --allow-run-as-root --host $HOSTS -np 2 -x NCCL_DEBUG=WARN -x NCCL_IB_DISABLE=0 -x NCCL_IB_HCA=mlx5_2,mlx5_5 -x CUDA_DEVICE_MAX_CONNECTIONS=1 ./bdtest fi diff --git a/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/README.md b/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/README.md new file mode 100644 index 000000000..72e0824ef --- /dev/null +++ b/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/README.md @@ -0,0 +1,40 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器内P2P互联带宽测试值 | 服务器P2P互联带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 diff --git a/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/cnvs.example.yml b/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..52a5fef8c --- /dev/null +++ 
b/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,5 @@ +custom: +- custom: + mlulink: + data_size: 3000000 + link_type: over_port diff --git a/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/main.sh b/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/main.sh new file mode 100644 index 000000000..04305e274 --- /dev/null +++ b/base/toolkits/interconnect-P2P_intraserver/cambricon/MLU/main.sh @@ -0,0 +1,8 @@ +export MLU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cnvs -r mlulink -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +device0_1=$(sed -n '15p' "$LOG_PATH" | awk '{print $7}') +device1_0=$(sed -n '19p' "$LOG_PATH" | awk '{print $5}') +result=$(python3 -c "print(float($device0_1)*0.5 + float($device1_0)*0.5)") +echo "[FlagPerf Result]interconnect-P2P_intraserver-bandwidth=${result} GB/s" +rm -rf cnvs_stats ${LOG_PATH} #删除缓存文件 diff --git a/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/README.md b/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/README.md index 89a51a57f..b3c7319f9 100644 --- a/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/README.md +++ b/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/README.md @@ -15,13 +15,14 @@ * CPU:AMD EPYC7742-64core * docker版本:20.10.16 * 内存:1TiB +* 机内总线协议:Speed 16GT/s, Width x16(PCIE4) * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 # 评测结果 ## 核心评测结果 -| 评测项 | 服务器内P2P互联带宽测试值 | 服务器P2P互联带宽标定值 | 测试标定比例 | +| 评测项 | 服务器内P2P互联带宽测试值(双向) | 服务器P2P互联带宽标定值(双向) | 测试标定比例 | | ---- | ----------- | -------- | ------ | | 评测结果 | 564.13GB/s | 600GB/s | 94.02% | diff --git a/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/bandwidth.cu b/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/bandwidth.cu index 844532a18..b9913bfb4 100644 --- a/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/bandwidth.cu +++ b/base/toolkits/interconnect-P2P_intraserver/nvidia/A100/bandwidth.cu @@ -4,6 +4,8 @@ #include #include +#include 
+#include #define SIZE (1024ULL * 1024ULL * 1024ULL * sizeof(float)) #define WARMUP_ITERATIONS 100 @@ -94,18 +96,17 @@ int main() { checkCudaError(cudaMemcpy(d_src, d_dst, SIZE, cudaMemcpyDefault), "cudaMemcpy"); } } - checkCudaError(cudaEventRecord(end, 0), "cudaEventRecord"); checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize"); - checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime"); - - double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0); - - printf("[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); - printf("[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); - - + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0) + SIZE * ITERATIONS / (elapsed_time / 1000.0); + std::cout << "[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; + + std::cout << "[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; checkCudaError(cudaSetDevice(gpuid[0]), "cudaSetDevice"); checkCudaError(cudaDeviceDisablePeerAccess(gpuid[1]), "cudaDeviceDisablePeerAccess"); checkCudaError(cudaSetDevice(gpuid[1]), "cudaSetDevice"); diff --git a/base/toolkits/interconnect-h2d/cambricon/MLU/README.md b/base/toolkits/interconnect-h2d/cambricon/MLU/README.md new file mode 100644 index 000000000..660ff5b58 --- /dev/null +++ b/base/toolkits/interconnect-h2d/cambricon/MLU/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | CPU-芯片互联带宽测试值 
| CPU-芯片互联带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +注: h2d/d2h带宽受到CPU、PCIE、内存等服务器内AI芯片以外的模块影响,无标定值 + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 diff --git a/base/toolkits/interconnect-h2d/cambricon/MLU/cnvs.example.yml b/base/toolkits/interconnect-h2d/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..9b22940a1 --- /dev/null +++ b/base/toolkits/interconnect-h2d/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,27 @@ +custom: +- custom: + pcie: + numa_mode: "enable" + subtests: + link_check: + subtest_on: True + min_pci_generation: 1 + min_pci_width: 1 + h2d_d2h_d2d_bidir: + subtest_on: True + h2d_data_size: 1000000000.0 + h2d_repeat_num: 200 + h2d_min_bandwidth: 0 + h2d_memcpy_type: async + d2h_data_size: 1000000000.0 + d2h_repeat_num: 200 + d2h_min_bandwidth: 0 + d2h_memcpy_type: async + d2d_data_size: 1000000000.0 + d2d_repeat_num: 200 + d2d_min_bandwidth: 0 + d2d_memcpy_type: async + bidir_data_size: 1000000000.0 + bidir_repeat_num: 200 + bidir_min_bandwidth: 0 + bidir_memcpy_type: async \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/cambricon/MLU/main.sh b/base/toolkits/interconnect-h2d/cambricon/MLU/main.sh new file mode 100644 index 000000000..5d1eb7a92 --- /dev/null +++ b/base/toolkits/interconnect-h2d/cambricon/MLU/main.sh @@ -0,0 +1,7 @@ +#!/bin/bash +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cnvs -r pcie -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +bandwidth=$(sed -n '17p' "$LOG_PATH" | awk '{print $5}') +echo "[FlagPerf Result] interconnect-h2d bandwidth=$bandwidth GB/s" +rm -rf cnvs_stats 
${LOG_PATH} #删除缓存文件 \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/metax/C550/README.md b/base/toolkits/interconnect-h2d/metax/C550/README.md new file mode 100755 index 000000000..7dcebec3c --- /dev/null +++ b/base/toolkits/interconnect-h2d/metax/C550/README.md @@ -0,0 +1,48 @@ +# 参评AI芯片信息 + +* 厂商:Metax + + +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:8 +* 服务器型号:OAM C550-1500 +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.15.0-58-generic +* CPU:Inter(R) Xeon(R) Plattinum 8480+ +* docker版本:24.0.7 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | CPU-芯片互联带宽测试值(8卡平均) | CPU-芯片互联带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +注: h2d/d2h带宽受到CPU、PCIE、内存等服务器内AI芯片以外的模块影响,无标定值 + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | 4238.1W | 4284.0W | 121.65W | / | 99.2W | 100.0W | 4.58W | 450W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | 0.089% | 1.151% | 34.32°C | 1.285% | + +# 厂商测试工具原理说明 + +使用cudaMemcpy,进行hosttodevice的CPU-AI芯片互联操作,计算CPU-AI芯片互联带宽 diff --git a/base/toolkits/interconnect-h2d/metax/C550/bandwidth.cu b/base/toolkits/interconnect-h2d/metax/C550/bandwidth.cu new file mode 100755 index 000000000..f5088e2f3 --- /dev/null +++ b/base/toolkits/interconnect-h2d/metax/C550/bandwidth.cu @@ -0,0 +1,56 @@ +// Copyright (c) 2024 BAAI. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License") +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkCudaError(cudaError_t err, const char *msg) { + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float *d_src, *d_dst; + cudaEvent_t start, end; + float elapsed_time; + + checkCudaError(cudaMallocHost(&d_src, SIZE), "cudaMallocHost"); + checkCudaError(cudaMalloc(&d_dst, SIZE), "cudaMalloc"); + + checkCudaError(cudaEventCreate(&start), "cudaEventCreate"); + checkCudaError(cudaEventCreate(&end), "cudaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkCudaError(cudaMemcpy(d_dst, d_src, SIZE, cudaMemcpyHostToDevice), "cudaMemcpy"); + } + + checkCudaError(cudaEventRecord(start), "cudaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkCudaError(cudaMemcpy(d_dst, d_src, SIZE, cudaMemcpyHostToDevice), "cudaMemcpy"); + } + + checkCudaError(cudaEventRecord(end), "cudaEventRecord"); + checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize"); + + checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime"); + + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]transfer-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf Result]transfer-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkCudaError(cudaFreeHost(d_src), "cudaFreeHost"); + checkCudaError(cudaFree(d_dst), "cudaFree"); + checkCudaError(cudaEventDestroy(start), "cudaEventDestroy"); + checkCudaError(cudaEventDestroy(end), "cudaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/metax/C550/main.sh b/base/toolkits/interconnect-h2d/metax/C550/main.sh new file mode 100755 index 
000000000..fdc9a6bdc --- /dev/null +++ b/base/toolkits/interconnect-h2d/metax/C550/main.sh @@ -0,0 +1,8 @@ +export MACA_PATH=/opt/maca +export CUDA_PATH=$MACA_PATH/tools/cu-bridge +export MACA_CLANG_PATH=$MACA_PATH/mxgpu_llvm/bin +export LD_LIBRARY_PATH=./:$MACA_PATH/lib:$LD_LIBRARY_PATH +export PATH=$CUDA_PATH/bin:$MACA_CLANG_PATH:$PATH +export MACA_VISIBLE_DEVICES=0 +cucc bandwidth.cu -lcublas -o bdtest +./bdtest \ No newline at end of file diff --git a/base/toolkits/main_memory-bandwidth/cambricon/MLU/README.md b/base/toolkits/main_memory-bandwidth/cambricon/MLU/README.md new file mode 100644 index 000000000..bbf5fdf33 --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/cambricon/MLU/README.md @@ -0,0 +1,40 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储带宽测试值 | 主存储带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 \ No newline at end of file diff --git a/base/toolkits/main_memory-bandwidth/cambricon/MLU/cnvs.example.yml b/base/toolkits/main_memory-bandwidth/cambricon/MLU/cnvs.example.yml new file mode 100644 index 000000000..8378909f3 --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/cambricon/MLU/cnvs.example.yml @@ -0,0 +1,4 @@ +custom: +- custom: + memory_bandwidth: + iterations: 15000 diff --git a/base/toolkits/main_memory-bandwidth/cambricon/MLU/main.sh 
b/base/toolkits/main_memory-bandwidth/cambricon/MLU/main.sh new file mode 100644 index 000000000..5f22f7173 --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/cambricon/MLU/main.sh @@ -0,0 +1,6 @@ +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | awk '{print $1}'`_run_log +cnvs -r memory_bandwidth -c `pwd`/cnvs.example.yml 2>&1 | tee ${LOG_PATH} +value=$(grep "read" "$LOG_PATH" | awk '{print $2}') +echo "[FlagPerf Result]main_memory-bandwidth=${value} GB/s" +rm -rf cnvs_stats ${LOG_PATH} #删除缓存文件 \ No newline at end of file diff --git a/base/toolkits/main_memory-bandwidth/metax/C550/README.md b/base/toolkits/main_memory-bandwidth/metax/C550/README.md new file mode 100755 index 000000000..f2fc2a729 --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/metax/C550/README.md @@ -0,0 +1,47 @@ +# 参评AI芯片信息 + +* 厂商:Metax + + +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:450W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:2 +* 服务器型号:OAM C550-1500 +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.15.0-58-generic +* CPU:Inter(R) Xeon(R) Plattinum 8480+ +* docker版本:24.0.7 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储带宽测试值(8卡平均) | 主存储带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | 1495.70GB/s | 1.8TB/s | 83.09% | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | 4284.0W | 4284.0W | 0.0W | / | 209.0W | 317.0W | 108.0W | 450W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | 0.076% | 0.496% | 39.0°C | 51.579% | + + +# 厂商测试工具原理说明 + +使用cuda核函数,进行读+写AI芯片主存储操作,计算AI芯片主存储带宽 diff --git a/base/toolkits/main_memory-bandwidth/metax/C550/bandwidth.cu b/base/toolkits/main_memory-bandwidth/metax/C550/bandwidth.cu new file mode 
100755 index 000000000..b1b6c04bc --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/metax/C550/bandwidth.cu @@ -0,0 +1,66 @@ +// Copyright (c) 2024 BAAI. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License") + +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkCudaError(cudaError_t err, const char *msg) { + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +__global__ void copyKernel(void* d_dst, const void* d_src, size_t size) { + size_t offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset < size) { + ((double*)d_dst)[offset] = ((const double*)d_src)[offset]; + } +} + +int main() { + double *d_src, *d_dst; + cudaEvent_t start, end; + float elapsed_time; + + checkCudaError(cudaMalloc(&d_src, SIZE), "cudaMalloc"); + checkCudaError(cudaMalloc(&d_dst, SIZE), "cudaMalloc"); + + checkCudaError(cudaEventCreate(&start), "cudaEventCreate"); + checkCudaError(cudaEventCreate(&end), "cudaEventCreate"); + + int threadsPerBlock = 1024; + size_t numElem = SIZE/sizeof(double); + int blocksPerGrid = (numElem + threadsPerBlock - 1) / threadsPerBlock; + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + copyKernel<<>>(d_dst, d_src, SIZE); + } + cudaDeviceSynchronize(); + checkCudaError(cudaEventRecord(start), "cudaEventRecord"); + for (int i = 0; i < ITERATIONS; ++i) { + copyKernel<<>>(d_dst, d_src, SIZE); + } + cudaDeviceSynchronize(); + checkCudaError(cudaEventRecord(end), "cudaEventRecord"); + checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize"); + + checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime"); + + double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf 
Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkCudaError(cudaFree(d_src), "cudaFree"); + checkCudaError(cudaFree(d_dst), "cudaFree"); + checkCudaError(cudaEventDestroy(start), "cudaEventDestroy"); + checkCudaError(cudaEventDestroy(end), "cudaEventDestroy"); + + return 0; +} diff --git a/base/toolkits/main_memory-bandwidth/metax/C550/main.sh b/base/toolkits/main_memory-bandwidth/metax/C550/main.sh new file mode 100755 index 000000000..9d9f1043e --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/metax/C550/main.sh @@ -0,0 +1,7 @@ +export MACA_PATH=/opt/maca +export CUDA_PATH=$MACA_PATH/tools/cu-bridge +export MACA_CLANG_PATH=$MACA_PATH/mxgpu_llvm/bin +export LD_LIBRARY_PATH=./:$MACA_PATH/lib:$LD_LIBRARY_PATH +export PATH=$CUDA_PATH/bin:$MACA_CLANG_PATH:$PATH +cucc bandwidth.cu -lcublas -o bdtest +./bdtest diff --git a/base/toolkits/main_memory-capacity/cambricon/MLU/README.md b/base/toolkits/main_memory-capacity/cambricon/MLU/README.md new file mode 100644 index 000000000..459932439 --- /dev/null +++ b/base/toolkits/main_memory-capacity/cambricon/MLU/README.md @@ -0,0 +1,38 @@ +# 参评AI芯片信息 + +* 厂商:Cambricon +* 产品名称:MLU +* 产品型号:/ +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.1 LTS +* 操作系统内核:linux5.15.0-97-generic +* CPU:/ +* docker版本:25.0.3 +* 内存:2TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储容量测试值 | 主存储容量标定值 | 测试标定比例 | +| ---- | ----------------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +此评测样例中无意义 + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | +| ---- | --------- | -------- | +| 监控结果 | / | / | + +# 厂商测试工具原理说明 diff --git a/base/toolkits/main_memory-capacity/cambricon/MLU/main.sh b/base/toolkits/main_memory-capacity/cambricon/MLU/main.sh new file mode 100644 index 000000000..dd1b4499d --- /dev/null +++ b/base/toolkits/main_memory-capacity/cambricon/MLU/main.sh @@ -0,0 +1,12 @@ +export MLU_VISIBLE_DEVICES=0 +LOG_PATH=`pwd`/`hostname -i | 
awk '{print $1}'`_run_log +pushd /usr/local/neuware/samples/cnrt && mkdir -p build && pushd build && cmake .. && make -j20 && pushd bin +for i in $(seq 1 7500) +do + echo ${i} + ./basic_device_info 2>&1 | tee ${LOG_PATH} +done +value=$(grep "Device 0 has avaliable memory in MB" "$LOG_PATH" | awk '{print $8}') +echo "[FlagPerf Result]main_memory-capacity=${value} MiB" +rm -rf ${LOG_PATH} #删除缓存文件 +popd && popd && popd \ No newline at end of file diff --git a/base/vendors/cambricon/cambricon_analysis.py b/base/vendors/cambricon/cambricon_analysis.py new file mode 100644 index 000000000..313d18dcf --- /dev/null +++ b/base/vendors/cambricon/cambricon_analysis.py @@ -0,0 +1,25 @@ +def analysis_log(logpath, config): + logfile = open(logpath) + + result = {"temp": {}, "power": {}, "mem": {}} + for gpuID in range(config.NPROC_PER_NODE): + for monitor_index in result.keys(): + result[monitor_index][gpuID] = [] + + max_mem = None + next_gpu_id = 0 + + for line in logfile.readlines(): + if "MiB" in line: + if max_mem is None: + max_mem = float(line.split(" ")[5]) + result["max_mem"] = max_mem + temp = float(line.split(" ")[0][:-1]) + power = float(line.split(" ")[1]) + mem = float(line.split(" ")[3]) + result["temp"][next_gpu_id].append(temp) + result["power"][next_gpu_id].append(power) + result["mem"][next_gpu_id].append(mem) + next_gpu_id = (next_gpu_id + 1) % config.NPROC_PER_NODE + + return result diff --git a/base/vendors/cambricon/cambricon_monitor.py b/base/vendors/cambricon/cambricon_monitor.py new file mode 100755 index 000000000..77afbc337 --- /dev/null +++ b/base/vendors/cambricon/cambricon_monitor.py @@ -0,0 +1,289 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess 
+import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. + ''' + + def __init__(self, + pid_file, + log_file, + err_file, + mlu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.mlufile = mlu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def mlu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "paste <(cnmon |grep 'Default') <(cnmon |grep 'MLU' | head -n -1) | awk '{print $3,$4,$5,$9,$10,$11,$25}'; echo \"\"" + process = subprocess.Popen(cmd, + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_mlu_mon(): + mlu_process = Process(target=mlu_mon, args=(self.mlufile, )) + mlu_process.start() + + schedule.every(self.rate).seconds.do(timer_mlu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if 
self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.mlufile): + os.remove(self.mlufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='/tmp/', + help='log path') + args = parse.parse_args() + return args + + +def get_system_info(): + cmd = r"echo OS version:;" + cmd = cmd + r"cat /etc/issue | head -n1 | awk '{print $1, $2, $3}';" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo OS Kernel version:;" + cmd = cmd + r"uname -r;" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Hardware Model:;" + cmd = cmd + r"sudo dmidecode | grep -A9 'System Information' | tail -n +2 | sed 's/^[ \t]*//';" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Accelerator Model:;" + cmd = cmd + r"cnmon -l;" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Accelerator Driver version:;" + cmd = cmd + r"cnmon | grep 'CNMON' | awk '{print $3}';" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Docker version:;" + cmd = cmd + r"docker -v" + + return cmd + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/mlu_monitor.pid') + log_fn = str(log_path + 
'/cambricon_monitor.log') + err_fn = str(log_path + '/cambricon_monitor.err') + # result for mlu + mlu_fn = str(log_path + '/cambricon_monitor.log') + sys_fn = str(log_path + '/sys_info.log') + cmd = get_system_info() + with open(sys_fn, "w") as f: + p = subprocess.Popen(cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p.wait() + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + mlu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' % pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/base/vendors/cambricon/pytorch_2.1/Dockerfile b/base/vendors/cambricon/pytorch_2.1/Dockerfile new file mode 100644 index 000000000..881bd28db --- /dev/null +++ b/base/vendors/cambricon/pytorch_2.1/Dockerfile @@ -0,0 +1,16 @@ +FROM flagperf:cambricon-deepspeed-v24.06-torch2.1.0-catch1.21.0-ubuntu22.04-py310-megatron-patch +#shell +SHELL ["/bin/bash", "-c"] +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata && apt-get install -y openssh-server && mkdir -p /run/sshd +RUN cp /etc/apt/sources.list /etc/apt/sources.list.backup \ + && sed -i 's|http://.*archive.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list \ + && sed -i 's|http://.*security.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list + +RUN apt update -y && apt install -y sudo dmidecode ipmitool sysstat net-tools sshpass libnuma-dev + +# modify ~/.bashrc file +RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc +RUN echo -e "\n# Add environment variables\n\ +export NEUWARE_HOME=/usr/local/neuware\n\ +export 
LD_LIBRARY_PATH=/usr/local/mpi_wrapper/build/install/lib64:/usr/local/neuware/lib64:/usr/local/openmpi/lib:${LD_LIBRARY_PATH}\n\ +export PATH=/torch/venv3/pytorch/bin:/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}" >> ~/.bashrc diff --git a/base/vendors/cambricon/pytorch_2.1/pytorch_2.1_install.sh b/base/vendors/cambricon/pytorch_2.1/pytorch_2.1_install.sh new file mode 100644 index 000000000..27e26b929 --- /dev/null +++ b/base/vendors/cambricon/pytorch_2.1/pytorch_2.1_install.sh @@ -0,0 +1,16 @@ +set -xe +pip install schedule loguru +pushd /usr/local/neuware/share/cnclbenchmark/cnmpi_wrapper/ +bash build.sh +popd + +#配置免密 +echo 'root:123456' | sudo chpasswd +rm -rf ~/.ssh/* && ssh-keygen -t rsa -N '' -f /root/.ssh/id_rsa -q +sed -i '/StrictHostKeyChecking/c StrictHostKeyChecking no' /etc/ssh/ssh_config +sed -i 's/#Port 22/Port 1234/g' /etc/ssh/sshd_config +sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' /etc/ssh/sshd_config +sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' /etc/ssh/sshd_config +/etc/init.d/ssh restart +#免密 +sshpass -p "123456" ssh-copy-id -i ~/.ssh/id_rsa.pub -p 1234 root@`hostname -i | awk '{print $1}'` diff --git a/base/vendors/kunlunxin/kunlunxin_analysis.py b/base/vendors/kunlunxin/kunlunxin_analysis.py new file mode 100644 index 000000000..ebee86b2d --- /dev/null +++ b/base/vendors/kunlunxin/kunlunxin_analysis.py @@ -0,0 +1,25 @@ +def analysis_log(logpath, config): + logfile = open(logpath) + + result = {"temp": {}, "power": {}, "mem": {}} + for gpuID in range(config.NPROC_PER_NODE): + for monitor_index in result.keys(): + result[monitor_index][gpuID] = [] + + max_mem = None + next_gpu_id = 0 + + for line in logfile.readlines(): + if "MiB" in line: + if max_mem is None: + max_mem = float(line.split(" ")[3][:-3]) + result["max_mem"] = max_mem + temp = float(line.split(" ")[0][:-1]) + power = float(line.split(" 
")[1][:-1]) + mem = float(line.split(" ")[2][:-3]) + result["temp"][next_gpu_id].append(temp) + result["power"][next_gpu_id].append(power) + result["mem"][next_gpu_id].append(mem) + next_gpu_id = (next_gpu_id + 1) % config.NPROC_PER_NODE + + return result diff --git a/base/vendors/kunlunxin/kunlunxin_monitor.py b/base/vendors/kunlunxin/kunlunxin_monitor.py new file mode 100644 index 000000000..c031e63d2 --- /dev/null +++ b/base/vendors/kunlunxin/kunlunxin_monitor.py @@ -0,0 +1,256 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. 
+ ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "xpu-smi -m | awk '{print $5\"C\",$9\"W\",$18\"MiB\",$19\"MiB\",$20\"%\"}'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = 
os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. 
''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/kunlunxin_monitor.pid') + log_fn = str(log_path + '/kunlunxin_monitor.log') + err_fn = str(log_path + '/kunlunxin_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/kunlunxin_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' 
% pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/base/vendors/kunlunxin/xpytorch029/Dockerfile b/base/vendors/kunlunxin/xpytorch029/Dockerfile new file mode 100644 index 000000000..d360fe89c --- /dev/null +++ b/base/vendors/kunlunxin/xpytorch029/Dockerfile @@ -0,0 +1,16 @@ +FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.29 +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python +ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH +ENV PATH /usr/local/xpu/bin:$PATH +RUN /bin/bash -c 'wget -O /tmp/xre.tar.gz https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.15.1/xre-Linux-x86_64-5.0.15.1.tar.gz && cd /tmp && tar zxf xre.tar.gz && cp -a xre-Linux-x86_64-5.0.15.1 /usr/local/xpu' + +#RUN apt-get update +RUN pip3 install loguru +#RUN pip3 install pycuda +RUN pip3 install schedule +RUN pip3 install munch +RUN /bin/bash -c 'wget -O /tmp/xpytorch.run https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/R300_plus/latest/xpytorch-cp38-torch201-ubuntu2004-x64.run && bash /tmp/xpytorch.run' +#RUN /bin/bash -c 'source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda' +ENV CUDART_DUMMY_REGISTER 1 diff --git a/base/vendors/kunlunxin/xpytorch029/xpytorch029_install.sh b/base/vendors/kunlunxin/xpytorch029/xpytorch029_install.sh new file mode 100644 index 000000000..a02584fad --- /dev/null +++ b/base/vendors/kunlunxin/xpytorch029/xpytorch029_install.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -x + +# conda env +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +pip install pytest loguru schedule + +# xpytorch install +#wget -q -O xpytorch.run https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/R300_plus/latest/xpytorch-cp38-torch201-ubuntu2004-x64.run && bash 
xpytorch.run &> install-xpytorch.log +CUDART_DUMMY_REGISTER=1 python -m torch_xmlir --doctor +CUDART_DUMMY_REGISTER=1 python -c "import torch; print(torch.rand(512, 128).cuda())" + diff --git a/base/vendors/metax/pytorch_2.0/Dockerfile b/base/vendors/metax/pytorch_2.0/Dockerfile new file mode 100755 index 000000000..eac960b81 --- /dev/null +++ b/base/vendors/metax/pytorch_2.0/Dockerfile @@ -0,0 +1,15 @@ +FROM llama2_70b_qwen_0720:2.23.0.13.342-ubuntu20.04-amd64 +ENV PATH=$PATH:/opt/conda/bin +RUN /bin/bash -c "pip3 config set global.index-url https://mirror.baidu.com/pypi/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python +RUN apt-get update +RUN pip3 install loguru +RUN pip3 install schedule +RUN pip3 install argparse +RUN pip3 install pyyaml +ENV MCCL_FAST_WRITE_BACK=1 +ENV MCCL_EARLY_WRITE_BACK=15 +ENV MCCL_P2P_LEVEL=SYS +ENV MCCL_NET_GDR_LEVEL=SYS +ENV MCCL_CROSS_NIC=1 diff --git a/base/vendors/metax/pytorch_2.0/pytorch2.0_install.sh b/base/vendors/metax/pytorch_2.0/pytorch2.0_install.sh new file mode 100755 index 000000000..cc1f786e8 --- /dev/null +++ b/base/vendors/metax/pytorch_2.0/pytorch2.0_install.sh @@ -0,0 +1 @@ +#!/bin/bash \ No newline at end of file diff --git a/docs/operations/assets/.DS_Store b/docs/operations/assets/.DS_Store new file mode 100644 index 000000000..9a18f9c4f Binary files /dev/null and b/docs/operations/assets/.DS_Store differ diff --git a/docs/operations/assets/sample.jpg b/docs/operations/assets/sample.jpg new file mode 100644 index 000000000..5ba614cac Binary files /dev/null and b/docs/operations/assets/sample.jpg differ diff --git "a/docs/operations/assets/\345\215\225\344\270\252\347\256\227\345\255\220\346\211\247\350\241\214\346\265\201\347\250\213.png" "b/docs/operations/assets/\345\215\225\344\270\252\347\256\227\345\255\220\346\211\247\350\241\214\346\265\201\347\250\213.png" new file mode 100644 index 000000000..0141e41e4 Binary files /dev/null and 
"b/docs/operations/assets/\345\215\225\344\270\252\347\256\227\345\255\220\346\211\247\350\241\214\346\265\201\347\250\213.png" differ diff --git a/docs/operations/operations-case-doc.md b/docs/operations/operations-case-doc.md new file mode 100644 index 000000000..da1f08ed2 --- /dev/null +++ b/docs/operations/operations-case-doc.md @@ -0,0 +1,150 @@ +# 算子评测厂商适配文档 + +为了评估AI芯片在原生算子和Triton算子([FlagGems](https://github.com/FlagOpen/FlagGems))方面的支持程度和性能,FlagPerf 设计并实现了针对各个算子在 AI 芯片的评测方案。具体的评测方案细节可以向 FlagPerf 团队索取评测方案文档,这里仅介绍厂商适配需关注的详细内容。厂商按照本文档完成适配后,评测时会自动生成相关指标结果。 + +## 工程组织形式 +算子评测相关代码均在FlagPerf/operation目录下, 整体结构如下: +``` +├── benchmarks +│   ├── abs +│   │   ├── case_config.yaml +│   │   ├── main.py +│   │   └── nvidia +│   │   └── A100_40_SXM +│   │   ├── README.md +│   │   ├── case_config.yaml +│   │   ├── env.sh +│   │   └── requirements.txt +├── configs +│   └── host.yaml +├── container_main.py +├── run.py +└── vendors + └── nvidia + ├── ngctorch2403 + │   ├── Dockerfile + │   └── ngctorch2403_install.sh + ├── nvidia_analysis.py + └── nvidia_monitor.py + +``` +1、benchmarks + +存放各个算子评测代码。每个算子必定包含: + +* case_config.yaml,为对应算子的各超参配置,原则上硬件无关 +* main.py,为对应算子的主进程 +* vendor/目录,存放各厂商相关文件: + * case_config.yaml,可覆盖式更新上级目录的超参配置。原则上推荐采用FlagPerf 的默认配置,如果因对应芯片无法支持FlagPerf默认配置, 可以在该文件中修改超参配置 + * env.sh,可厂商自定义针对该算子的环境变量/执行shell脚本,会在启动main.py之前由FlagPerf自动执行 + * requirements.txt,可厂商自定义pip安装包,会由FlagPerf自动执行 + * README.md,记录厂商此样例使用服务器的规格、芯片规格,并记录评测结果中可以公开的部分 + +2、configs +下设一个文件host.yaml,存放各主机IP,端口,FlagPerf路径等信息 + +此文件每次运行时自由更改填写,无需适配或更新提交 + +3、container_main.py + +此文件为容器内主进程,负责根据host.yaml启动对应评测样例主进程 + +4、run.py + +此文件为FlagPerf评测主进程,负责根据host.yaml启动并准备集群环境,启动container_main.py + +5、vendors + +此文件存放各厂商相关环境基础文件,每个厂商必定包含: +* \_analysis.py,用于解析各评测项结果,可参考给出的英伟达实现方案 +* \_monitor.py,用于对AI芯片进行温度、功率、显存使用等方面的监控,可参考给出的英伟达实现方案 +* \,包含对应运行时环境 + * Dockerfile : + * \_install.sh: 可自定义全局的环境变量、安装软件等操作,会在测例评测开始由FlagPerf 自动执行 + +注意: 这里的**envname**需要和host.yaml中 CASES 的 value 值保持一致。可以参考下图英伟达的命名与使用方式。 
+![sample](assets/sample.jpg) + + +## 评测运行时流程 + +#### 运行前工作 +1、配置修改 +在运行评测前,需要填写configs/host.yaml文件 +``` +FLAGPERF_PATH: "/home/FlagPerf/operation" +FLAGPERF_LOG_PATH: "result" +VENDOR: "nvidia" +FLAGPERF_LOG_LEVEL: "info" +HOSTS: ["192.168.1.2"] +NPROC_PER_NODE: 1 +SSH_PORT: "22" +HOSTS_PORTS: ["2222"] +MASTER_PORT: "29501" +SHM_SIZE: "32G" +ACCE_CONTAINER_OPT: " --gpus all" +# for nvidia, using " -- gpus all" +# for xxx, using +PIP_SOURCE: "https://mirror.baidu.com/pypi/simple" +CLEAR_CACHES: True +# for nvidia, using "CUDA_VISIBLE_DEVICES" +# for xxx, using +ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES" +# "operation:dataFormat:chip": "docker_images" +# now only support flaggems and nativepytorch +CASES: + "mm:FP16:nativetorch:A100_40_SXM": "ngctorch2403" +``` +在host.yaml文件中,各项配置含义如下: + +* FLAGPERF_PATH: 为FlagPerf/operation/所在**绝对路径** +* FLAGPERF_LOG_PATH: 为填写日志目录相对于FlagPerf/operation/的**相对路径**,需要具有write权限 +* VENDOR: 为厂商名称 +* FLAGPERF_LOG_LEVEL: 为日志记录等级,可选debug、info、error等 +* HOSTS:为一个字符串数组,包含若干主机的IP。数组0位置填写的IP为MASTER +* NPROC_PER_NODE: 表示每台主机启动的AI芯片数量 +* SSH_PORT: 表示主机间免密登录所用端口 +* HOST_PORTS: 表示容器间torch通信所用端口 +* MASTER_PORT: 表示容器间torch通信对应master端口 +* SHM_SIZE:表示容器启动时共享内存大小 +* ACCE_CONTAINER_OPT: 表示AI芯片进入镜像所需命令。例如对于英伟达,命令为" --gpus all" +* PIP_SOURCE: 表示容器内PIP所用源地址 +* CLEAR_CACHE: 表示启动测试前是否清理系统cache,原则上为True +* ACCE_VISIBLE_DEVICE_ENV_NAME: 表示选定AI芯片所用环境变量。例如对于英伟达,环境变量为"CUDA_VISIBLE_DEVICES" +* CASES: 为一个字典。key为评测算子名称:数制:算子库名:芯片型号, value为对应运行时环境名称。 + 例如,可使用"mm:FP16:nativetorch:A100_40_SXM": "ngctorch2403" 来以FP6数制基于原生NativeTorch执行mm算子; + 可使用"mm:FP32:flaggems:A100_40_SXM": "ngctorch2403" 来以FP32数制基于FlagGems算子库执行mm算子; + ngctorch2403 为vendors目录下被评测厂商对应运行环境的名称。 + +2、运行流程 +为了更好的理解整体流程,这里以流程图的形式简述单个算子评测的主要流程。 +![单个算子执行流程](assets/%E5%8D%95%E4%B8%AA%E7%AE%97%E5%AD%90%E6%89%A7%E8%A1%8C%E6%B5%81%E7%A8%8B.png) + +3、快速评测方法 +因算子数量众多,本方案提供了快速执行所有算子并渲染结果的脚本,以帮助厂商快速确认。脚本位于``` operation/helper``` 目录下,使用方法如下: +``` +# 安装依赖 +cd operation/helper +pip install -r requirements.txt + 
+# 按照实际情况修改 main.sh 脚本 +vim main.sh +# 该脚本中有两处需要修改 +# (1)与厂商、执行环境相关的数据,即“修改点 1” +# (2)与测例相关的数据,即“修改点 2”,其格式为 算子名="数制1 数制2 数制3" + +# 执行 main.sh 即可, 执行完成后会在operation/results 目录下看到每个算子、每个数制的执行结果和日志 +bash main.sh +``` + + +## 厂商适配文档 +#### 初次适配 +如“评测运行时流程”的“运行流程”中**黄颜色部分**所示,厂商需要适配的分为三个部分: +* 适配样例的case_config.yaml,env.sh,requirements.txt,根据机器信息及评测结果填写README.md,位于benchmarks/\/\\/, 可参考英伟达方案 +* 适配监控和日志分析等方法,该部分位于vendors/\/目录下,形式与内容可以参考英伟达方案 +* 提交厂商自身相关环境文件及相关代码,即vendors/\/\<环境名\>目录,组织形式及内容可参考英伟达方案 +#### 后续适配 +厂商后续参与某评测样例时,需要提交初次适配所需的 1 部分文件,不需要提交第2、3部分 +#### 配置及结果更新 +厂商如认为现有评测结果及配置不足以展现自身能力,可修改初次适配所需的 1 部分中的文件,并修改README.md中的结果。 diff --git a/flaggems_model/model_bert_test.py b/flaggems_model/model_bert_test.py index 7b49b6254..6b10403ae 100644 --- a/flaggems_model/model_bert_test.py +++ b/flaggems_model/model_bert_test.py @@ -2,6 +2,12 @@ import pytest import torch + +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass + from transformers import AutoTokenizer, BertConfig, BertModel import flag_gems diff --git a/operation/benchmarks/abs/cambricon/MLU/case_config.yaml b/operation/benchmarks/abs/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/abs/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/abs/cambricon/MLU/env.sh b/operation/benchmarks/abs/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/abs/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/abs/case_config.yaml b/operation/benchmarks/abs/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/abs/case_config.yaml +++ b/operation/benchmarks/abs/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/abs/iluvatar/BI150/README.md 
b/operation/benchmarks/abs/iluvatar/BI150/README.md new file mode 100644 index 000000000..a4eb1799a --- /dev/null +++ b/operation/benchmarks/abs/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.29% | 0.29% | +| nativetorch | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 7413.27us | 7422.1us | 134.89op/s | 134.73op/s | 242083.45us | 8447.68us | +| nativetorch | 0.00E+00 | 7381.57us | 7393.42us | 135.47op/s | 135.26op/s | 7741.95us | 7666.12us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2066.25W | 2090.0W | 31.15W | / | 162.6W | 163.0W | 2.79W | 350W | +| flaggems监控结果 | 2071.0W | 2090.0W | 32.91W | / | 168.93W | 169.0W | 0.3W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 41.474% | 2.386% | 45.82°C | 19.489% | +| flaggems监控结果 | 42.969% | 2.391% | 46.51°C | 19.489% | \ No newline at end of file diff --git 
a/operation/benchmarks/abs/iluvatar/BI150/case_config.yaml b/operation/benchmarks/abs/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..a06e87765 --- /dev/null +++ b/operation/benchmarks/abs/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/abs/iluvatar/BI150/env.sh b/operation/benchmarks/abs/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/abs/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/abs/iluvatar/BI150/requirements.txt b/operation/benchmarks/abs/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/abs/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/abs/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/abs/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/abs/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/abs/kunlunxin/R300p/env.sh b/operation/benchmarks/abs/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/abs/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/abs/main.py b/operation/benchmarks/abs/main.py index 9cc63a298..7aaf73aa3 100644 --- 
a/operation/benchmarks/abs/main.py +++ b/operation/benchmarks/abs/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.abs(a_fp64) - - a = a.to(0) - r_device = torch.abs(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "abs", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, 
latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/abs/metax/C550_64/case_config.yaml b/operation/benchmarks/abs/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/abs/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/abs/metax/C550_64/env.sh b/operation/benchmarks/abs/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/abs/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/abs/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/abs/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..2878cb0c8 --- /dev/null +++ b/operation/benchmarks/abs/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3201.08us | 3206.14us | 312.39op/s | 311.9op/s | 2272998.94us | 3297.47us | +| nativetorch | 3079.64us | 3086.34us | 324.71op/s | 324.01op/s | 14492.24us | 3106.5us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 256.58W | 260.0W | 2.55W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 291.21W | 300.0W | 16.82W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.797% | 2.308% | 47.03°C | 17.394% | +| flaggems监控结果 | 0.748% | 2.308% | 51.03°C | 17.207% | diff --git a/operation/benchmarks/abs/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/abs/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..bee853acd --- /dev/null +++ b/operation/benchmarks/abs/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3201.0us | 3207.17us | 312.4op/s | 311.8op/s | 1386926.17us | 3283.38us | +| nativetorch | 3079.7us | 3086.34us | 324.71op/s | 324.01op/s | 15379.74us | 3102.82us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 272.61W | 279.0W | 4.46W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 306.91W | 312.0W | 4.87W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.979% | 2.314% | 48.95°C | 17.394% | +| flaggems监控结果 | 0.735% | 2.313% | 50.68°C | 17.207% | diff --git a/operation/benchmarks/abs/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/abs/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..a37bfc2e2 --- /dev/null +++ b/operation/benchmarks/abs/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 
+ +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6261.54us | 6272.0us | 159.71op/s | 159.44op/s | 864443.97us | 6352.77us | +| nativetorch | 6195.61us | 6197.25us | 161.4op/s | 161.36op/s | 20390.02us | 6247.2us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 255.76W | 259.0W | 2.37W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 281.87W | 286.0W | 3.64W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.734% | 2.288% | 48.28°C | 32.551% | +| flaggems监控结果 | 0.756% | 2.288% | 49.92°C | 32.364% | diff --git a/operation/benchmarks/abs/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/abs/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/abs/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/abs/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/add/cambricon/MLU/case_config.yaml b/operation/benchmarks/add/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ 
b/operation/benchmarks/add/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/add/cambricon/MLU/env.sh b/operation/benchmarks/add/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/add/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/add/case_config.yaml b/operation/benchmarks/add/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/add/case_config.yaml +++ b/operation/benchmarks/add/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/cos/nvidia/A100_40_SXM/README.md b/operation/benchmarks/add/iluvatar/BI150/README.md similarity index 52% rename from operation/benchmarks/cos/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/add/iluvatar/BI150/README.md index 6963fa775..5d3ca6f56 100644 --- a/operation/benchmarks/cos/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/add/iluvatar/BI150/README.md @@ -1,56 +1,54 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. 
Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 2.32E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 2.32E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 5.78E-10 | 6140.78us | 6145.02us | 162.85op/s | 162.73op/s | 1511239.71us | 6207.61us | -| nativetorch | 5.78E-10 | 6163.42us | 6167.55us | 162.25op/s | 162.14op/s | 10481.96us | 6215.57us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1716.0W | 1716.0W | 0.0W | / | 314.13W | 319.0W | 4.74W | 1716.0 | -| flaggems监控结果 | 1794.0W | 1794.0W | 0.0W | / | 366.26W | 369.0W | 4.92W | 1794.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.726% | 1.298% | 52.24°C | 31.535% | -| flaggems监控结果 | 0.778% | 1.298% | 54.73°C | 41.64% | +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | 
-------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 1.62E-08 | 0.1TFLOPS | 0.1TFLOPS | 0.2% | 0.19% | +| nativetorch | 1.62E-08 | 0.1TFLOPS | 0.1TFLOPS | 0.2% | 0.2% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 7.93E-10 | 11194.37us | 11215.52us | 89.33op/s | 89.16op/s | 249786.04us | 11663.84us | +| nativetorch | 7.93E-10 | 10934.39us | 10941.88us | 91.45op/s | 91.39op/s | 11590.2us | 11492.98us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2074.8W | 2090.0W | 30.4W | / | 163.98W | 164.0W | 0.13W | 350W | +| flaggems监控结果 | 2082.4W | 2109.0W | 35.24W | / | 168.87W | 169.0W | 1.05W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 41.49% | 2.391% | 47.84°C | 25.739% | +| flaggems监控结果 | 44.653% | 2.389% | 47.81°C | 25.739% | \ No newline at end of file diff --git a/operation/benchmarks/add/iluvatar/BI150/case_config.yaml b/operation/benchmarks/add/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..9fe3174da --- /dev/null +++ b/operation/benchmarks/add/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 49.152 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/add/iluvatar/BI150/env.sh b/operation/benchmarks/add/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/add/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export 
LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/add/iluvatar/BI150/requirements.txt b/operation/benchmarks/add/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/add/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/add/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/add/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/add/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/add/kunlunxin/R300p/env.sh b/operation/benchmarks/add/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/add/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/add/main.py b/operation/benchmarks/add/main.py index efedcb1ff..2ba2bb250 100644 --- a/operation/benchmarks/add/main.py +++ b/operation/benchmarks/add/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,36 
+53,22 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - b = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.add(a_fp64, b_fp64) - - a = a.to(0) - b = b.to(0) - r_device = torch.add(a, b).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -85,8 +79,8 @@ def main(config, case_config): op2flops = lambda x: x * 2 * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "add", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -94,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/add/metax/C550_64/case_config.yaml b/operation/benchmarks/add/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null 
+++ b/operation/benchmarks/add/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/add/metax/C550_64/env.sh b/operation/benchmarks/add/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/add/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/add/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/add/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..5a394a65c --- /dev/null +++ b/operation/benchmarks/add/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.46TFLOPS | 0.46TFLOPS | 0.15% | 0.15% | +| nativetorch | True | 0.45TFLOPS | 0.45TFLOPS | 0.15% | 0.15% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4670.63us | 4678.66us | 214.1op/s | 213.74op/s | 1356985.05us | 4803.36us | +| nativetorch | 4740.34us | 4746.24us | 210.96op/s | 210.69op/s | 24034.67us | 4767.14us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 254.6W | 258.0W | 4.38W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 300.3W | 305.0W | 6.37W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.088% | 2.317% | 47.98°C | 22.447% | +| flaggems监控结果 | 1.032% | 2.315% | 51.06°C | 22.259% | diff --git a/operation/benchmarks/add/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/add/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..97d160b79 --- /dev/null +++ b/operation/benchmarks/add/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.46TFLOPS | 0.46TFLOPS | 0.15% | 0.15% | +| nativetorch | True | 0.45TFLOPS | 0.45TFLOPS | 0.15% | 0.14% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4666.92us | 4674.56us | 214.27op/s | 213.92op/s | 891453.37us | 4780.82us | +| nativetorch | 4734.18us | 4747.26us | 211.23op/s | 210.65op/s | 1037001.4us | 4766.17us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 110.31W | / | 268.62W | 275.0W | 3.89W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 97.28W | / | 299.53W | 304.0W | 4.04W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.911% | 2.322% | 48.6°C | 27.499% | +| flaggems监控结果 | 0.805% | 2.32% | 49.77°C | 22.259% | diff --git a/operation/benchmarks/add/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/add/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..35b533443 --- /dev/null +++ b/operation/benchmarks/add/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.23TFLOPS | 0.23TFLOPS | 1.18% | 1.18% | +| nativetorch | True | 0.23TFLOPS | 0.23TFLOPS | 1.16% | 1.16% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9330.4us | 9339.9us | 107.18op/s | 107.07op/s | 2090310.02us | 9424.17us | +| nativetorch | 9473.41us | 9479.17us | 105.56op/s | 105.49op/s | 28999.91us | 9502.91us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1606.8W | 1638.0W | 62.4W | / | 255.59W | 259.0W | 2.14W | 400W | +| flaggems监控结果 | 1653.6W | 1716.0W | 90.96W | / | 274.46W | 279.0W | 1.98W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.818% | 2.294% | 49.24°C | 42.656% | +| flaggems监控结果 | 0.832% | 2.293% | 48.61°C | 52.761% | diff --git a/operation/benchmarks/add/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/add/nvidia/A100_40_SXM/case_config.yaml index 8d0d3a431..bc4b04b42 100644 --- a/operation/benchmarks/add/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/add/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 312 \ No newline at end of file diff --git a/operation/benchmarks/addmm/cambricon/MLU/case_config.yaml b/operation/benchmarks/addmm/cambricon/MLU/case_config.yaml new 
file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/addmm/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/addmm/cambricon/MLU/env.sh b/operation/benchmarks/addmm/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/addmm/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/addmm/case_config.yaml b/operation/benchmarks/addmm/case_config.yaml new file mode 100644 index 000000000..95a66438a --- /dev/null +++ b/operation/benchmarks/addmm/case_config.yaml @@ -0,0 +1,7 @@ +M: 4096 +N: 4096 +K: 4096 +WARMUP: 100 +ITERS: 20000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/addmm/main.py b/operation/benchmarks/addmm/main.py new file mode 100644 index 000000000..1c018fac4 --- /dev/null +++ b/operation/benchmarks/addmm/main.py @@ -0,0 +1,106 @@ + # Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + m = case_config.M + n = case_config.N + k = case_config.K + + + a = torch.randn(m, n, dtype=dtype[config.dataformat]).to(0) + b = torch.randn(n, k, dtype=dtype[config.dataformat]).to(0) + c = torch.randn(m, k, dtype=dtype[config.dataformat]).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.addmm, (c, a, b), host_device_sync, config, case_config) + + op2flops = lambda x: x * 2 * m * n * k + 3 * x * m * k + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, 
+ latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) \ No newline at end of file diff --git a/base/benchmarks/computation-BF16/metax/case_config.yaml b/operation/benchmarks/addmm/metax/C550_64/case_config.yaml old mode 100755 new mode 100644 similarity index 53% rename from base/benchmarks/computation-BF16/metax/case_config.yaml rename to operation/benchmarks/addmm/metax/C550_64/case_config.yaml index fab052b4e..9e3a04d03 --- a/base/benchmarks/computation-BF16/metax/case_config.yaml +++ b/operation/benchmarks/addmm/metax/C550_64/case_config.yaml @@ -1,4 +1,4 @@ M: 8192 N: 8192 K: 8192 -DIST_BACKEND: "nccl" +ITERS: 20000 \ No newline at end of file diff --git a/operation/benchmarks/addmm/metax/C550_64/env.sh b/operation/benchmarks/addmm/metax/C550_64/env.sh new file mode 100644 index 000000000..79ff0fea1 --- /dev/null +++ b/operation/benchmarks/addmm/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_ENABLE_COMPILER_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/addmm/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/addmm/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..586c5b180 --- /dev/null +++ b/operation/benchmarks/addmm/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 
+* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 234.35TFLOPS | 249.1TFLOPS | 75.11% | 79.84% | +| nativetorch | True | 223.36TFLOPS | 227.57TFLOPS | 71.59% | 72.94% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 586.68us | 551.94us | 1704.5op/s | 1811.8op/s | 7012544.35us | 595.59us | +| nativetorch | 615.56us | 604.16us | 1624.53op/s | 1655.19op/s | 137563.58us | 743.5us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1540.5W | 1794.0W | 149.78W | / | 397.42W | 402.0W | 4.11W | 400W | +| flaggems监控结果 | 1540.5W | 1794.0W | 149.78W | / | 399.7W | 408.0W | 3.88W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.678% | 1.114% | 62.96°C | 2.534% | +| flaggems监控结果 | 0.829% | 1.116% | 62.92°C | 2.534% | diff --git a/operation/benchmarks/addmm/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/addmm/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..598d155ef --- /dev/null +++ b/operation/benchmarks/addmm/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 
20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 228.75TFLOPS | 243.68TFLOPS | 73.32% | 78.1% | +| nativetorch | True | 218.3TFLOPS | 225.28TFLOPS | 69.97% | 72.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 601.04us | 564.22us | 1663.8op/s | 1772.35op/s | 8423981.07us | 672.99us | +| nativetorch | 629.83us | 610.3us | 1587.73op/s | 1638.53op/s | 735235.98us | 738.47us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1521.0W | 1794.0W | 160.8W | / | 399.8W | 410.0W | 6.25W | 400W | +| flaggems监控结果 | 1540.5W | 1794.0W | 149.78W | / | 398.33W | 410.0W | 4.45W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.573% | 1.119% | 62.96°C | 2.534% | +| flaggems监控结果 | 0.689% | 1.113% | 62.52°C | 2.534% | diff --git a/operation/benchmarks/addmm/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/addmm/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..0e6876052 --- /dev/null +++ b/operation/benchmarks/addmm/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 
服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 18.4TFLOPS | 18.39TFLOPS | 94.34% | 94.3% | +| nativetorch | True | 18.77TFLOPS | 18.73TFLOPS | 96.26% | 96.07% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 7473.76us | 7477.25us | 133.8op/s | 133.74op/s | 21083149.32us | 7553.75us | +| nativetorch | 7324.84us | 7339.01us | 136.52op/s | 136.26op/s | 84724.97us | 7346.76us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1725.75W | 1794.0W | 131.9W | / | 360.9W | 366.0W | 2.26W | 400W | +| flaggems监控结果 | 1706.25W | 1794.0W | 153.23W | / | 355.72W | 360.0W | 3.04W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.759% | 1.094% | 63.27°C | 2.534% | +| flaggems监控结果 | 0.746% | 1.091% | 62.83°C | 2.534% | diff --git a/operation/benchmarks/addmm/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/addmm/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..5bc8be0b4 --- /dev/null +++ b/operation/benchmarks/addmm/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 20000 diff --git a/operation/benchmarks/addmm/nvidia/A100_40_SXM/env.sh 
b/operation/benchmarks/addmm/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/addmm/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/addmm/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/addmm/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/addmm/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/all/cambricon/MLU/case_config.yaml b/operation/benchmarks/all/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/all/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/all/cambricon/MLU/env.sh b/operation/benchmarks/all/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/all/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/all/case_config.yaml b/operation/benchmarks/all/case_config.yaml new file mode 100644 index 000000000..acc0f44fb --- /dev/null +++ b/operation/benchmarks/all/case_config.yaml @@ -0,0 +1,5 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/all/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/all/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..398fbc7c8 --- /dev/null +++ b/operation/benchmarks/all/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 256] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/all/kunlunxin/R300p/env.sh b/operation/benchmarks/all/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/all/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH 
start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/all/main.py b/operation/benchmarks/all/main.py new file mode 100644 index 000000000..236b07884 --- /dev/null +++ b/operation/benchmarks/all/main.py @@ -0,0 +1,110 @@ + # Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess +import math + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + m = case_config.Melements + # default arange: 0, M * 1024 * 1024 + arange_end = m * 1024 * 1024 + + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if 
case_config.__contains__('Shape') and case_config.Shape is not None: + arange_end = math.prod(case_config.Shape) + + a = torch.arange(0, arange_end).to(0) + print(f'Shape for performance_test: {a.shape}') + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.all, (a, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * arange_end + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/all/metax/C550_64/case_config.yaml b/operation/benchmarks/all/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..e38fa96aa --- /dev/null +++ b/operation/benchmarks/all/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 5000 diff --git a/operation/benchmarks/all/metax/C550_64/env.sh b/operation/benchmarks/all/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/all/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/all/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/all/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..78571f882 --- /dev/null +++ b/operation/benchmarks/all/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + 
+# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.18TFLOPS | 0.18TFLOPS | 0.06% | 0.06% | +| nativetorch | True | 0.19TFLOPS | 0.18TFLOPS | 0.06% | 0.06% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 5917.67us | 5940.22us | 168.99op/s | 168.34op/s | 3061884.82us | 5991.7us | +| nativetorch | 5788.96us | 5811.2us | 172.74op/s | 172.08op/s | 29267.76us | 5816.68us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 210.0W | 211.0W | 1.41W | 400W | +| flaggems监控结果 | 1482.0W | 1482.0W | 0.0W | / | 210.67W | 213.0W | 2.62W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.828% | 2.341% | 39.0°C | 23.059% | +| flaggems监控结果 | 0.684% | 2.334% | 39.17°C | 22.264% | diff --git a/operation/benchmarks/all/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/all/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..d71c04289 --- /dev/null +++ b/operation/benchmarks/all/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* 
TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.18TFLOPS | 0.18TFLOPS | 0.06% | 0.06% | +| nativetorch | True | 0.19TFLOPS | 0.18TFLOPS | 0.06% | 0.06% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 5917.67us | 5940.22us | 168.99op/s | 168.34op/s | 2575345.7us | 5978.89us | +| nativetorch | 5789.12us | 5810.18us | 172.74op/s | 172.11op/s | 26647.94us | 5806.55us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1482.0W | 1482.0W | 0.0W | / | 208.67W | 210.0W | 1.7W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 211.33W | 214.0W | 2.13W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.925% | 2.284% | 39.38°C | 22.452% | +| flaggems监控结果 | 1.627% | 2.296% | 40.08°C | 22.264% | diff --git a/operation/benchmarks/all/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/all/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..0c9439ba2 --- /dev/null +++ b/operation/benchmarks/all/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 
产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.18TFLOPS | 0.18TFLOPS | 0.93% | 0.93% | +| nativetorch | True | 0.19TFLOPS | 0.18TFLOPS | 0.95% | 0.95% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 5938.6us | 5940.22us | 168.39op/s | 168.34op/s | 1786354.81us | 5985.48us | +| nativetorch | 5789.48us | 5811.2us | 172.73op/s | 172.08op/s | 25188.06us | 5816.95us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1482.0W | 1482.0W | 0.0W | / | 209.83W | 213.0W | 2.11W | 400W | +| flaggems监控结果 | 1482.0W | 1482.0W | 0.0W | / | 214.33W | 215.0W | 0.75W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.065% | 2.284% | 41.29°C | 22.452% | +| flaggems监控结果 | 1.073% | 2.284% | 43.68°C | 22.264% | diff --git a/operation/benchmarks/all/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/all/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..e38fa96aa --- /dev/null +++ b/operation/benchmarks/all/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 5000 diff --git 
a/operation/benchmarks/all/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/all/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/all/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/all/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/all/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/all/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/amax/cambricon/MLU/case_config.yaml b/operation/benchmarks/amax/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/amax/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/amax/cambricon/MLU/env.sh b/operation/benchmarks/amax/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/amax/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/amax/case_config.yaml b/operation/benchmarks/amax/case_config.yaml new file mode 100644 index 000000000..53cadd1fd --- /dev/null +++ b/operation/benchmarks/amax/case_config.yaml @@ -0,0 +1,6 @@ +M: 1024 +N: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/amax/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/amax/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..1138a649a --- /dev/null +++ b/operation/benchmarks/amax/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 1] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/amax/kunlunxin/R300p/env.sh b/operation/benchmarks/amax/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/amax/kunlunxin/R300p/env.sh @@ -0,0 +1,5 
@@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/amax/main.py b/operation/benchmarks/amax/main.py new file mode 100644 index 000000000..228375cef --- /dev/null +++ b/operation/benchmarks/amax/main.py @@ -0,0 +1,111 @@ + # Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess +import math + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + m = case_config.M + n = case_config.N + # default shape: (M*80, N*80) + shape = (m * 80, n * 80) + + if config.vendor == 'kunlunxin': + # if `Shape' specified in 
`case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape + + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.amax, (a, 1), host_device_sync, config, case_config) + + op2flops = lambda x: x * math.prod(shape) + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/amax/metax/C550_64/case_config.yaml b/operation/benchmarks/amax/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/amax/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/amax/metax/C550_64/env.sh b/operation/benchmarks/amax/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/amax/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/amax/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/amax/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..b942785d2 --- /dev/null +++ b/operation/benchmarks/amax/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 
+1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.71TFLOPS | 0.71TFLOPS | 0.23% | 0.23% | +| nativetorch | True | 0.66TFLOPS | 0.66TFLOPS | 0.21% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9393.99us | 9414.66us | 106.45op/s | 106.22op/s | 3538341.1us | 9482.42us | +| nativetorch | 10144.45us | 10165.25us | 98.58op/s | 98.37op/s | 855711.86us | 10205.15us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 147.08W | / | 312.16W | 314.0W | 4.36W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 277.8W | 282.0W | 3.13W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.632% | 1.292% | 52.26°C | 33.405% | +| flaggems监控结果 | 0.611% | 1.322% | 48.9°C | 33.405% | diff --git a/operation/benchmarks/amax/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/amax/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..2b22ccbf4 --- /dev/null +++ 
b/operation/benchmarks/amax/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.71TFLOPS | 0.71TFLOPS | 0.23% | 0.23% | +| nativetorch | True | 0.67TFLOPS | 0.67TFLOPS | 0.21% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9399.79us | 9422.85us | 106.39op/s | 106.13op/s | 3611338.23us | 9498.9us | +| nativetorch | 10025.23us | 10046.46us | 99.75op/s | 99.54op/s | 1141446.76us | 10060.09us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1582.29W | 1716.0W | 154.4W | / | 319.51W | 323.0W | 3.64W | 400W | +| flaggems监控结果 | 1573.0W | 1716.0W | 145.34W | / | 281.84W | 287.0W | 6.24W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.622% | 1.331% | 50.64°C | 33.405% | +| flaggems监控结果 | 0.63% | 1.365% | 48.53°C | 33.405% | diff --git a/operation/benchmarks/amax/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/amax/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 
000000000..f0fb44713 --- /dev/null +++ b/operation/benchmarks/amax/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.36TFLOPS | 0.36TFLOPS | 1.84% | 1.83% | +| nativetorch | True | 0.34TFLOPS | 0.34TFLOPS | 1.76% | 1.75% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 18744.77us | 18765.82us | 53.35op/s | 53.29op/s | 2989468.51us | 18853.41us | +| nativetorch | 19601.09us | 19630.08us | 51.02op/s | 50.94op/s | 246469.82us | 19694.48us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1620.67W | 1716.0W | 80.37W | / | 268.39W | 272.0W | 1.56W | 400W | +| flaggems监控结果 | 1612.0W | 1638.0W | 73.54W | / | 268.85W | 271.0W | 1.97W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.623% | 1.082% | 48.21°C | 64.375% | +| flaggems监控结果 | 0.632% | 1.073% | 48.21°C | 64.982% | diff --git a/operation/benchmarks/amax/nvidia/A100_40_SXM/case_config.yaml 
b/operation/benchmarks/amax/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/amax/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/amax/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/amax/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/amax/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/amax/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/amax/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/amax/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/argmax/cambricon/MLU/case_config.yaml b/operation/benchmarks/argmax/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/argmax/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/argmax/cambricon/MLU/env.sh b/operation/benchmarks/argmax/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/argmax/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/argmax/case_config.yaml b/operation/benchmarks/argmax/case_config.yaml new file mode 100644 index 000000000..53cadd1fd --- /dev/null +++ b/operation/benchmarks/argmax/case_config.yaml @@ -0,0 +1,6 @@ +M: 1024 +N: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/argmax/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/argmax/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..1138a649a --- /dev/null +++ b/operation/benchmarks/argmax/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 1] +ITERS: 
50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/argmax/kunlunxin/R300p/env.sh b/operation/benchmarks/argmax/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/argmax/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/argmax/main.py b/operation/benchmarks/argmax/main.py new file mode 100644 index 000000000..4f99b68b8 --- /dev/null +++ b/operation/benchmarks/argmax/main.py @@ -0,0 +1,111 @@ + # Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess +import math + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, 
+ "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + m = case_config.M + n = case_config.N + # default shape: (M * 80, N * 80) + shape = (m * 80, n * 80) + + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape + + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.argmax, (a, 1), host_device_sync, config, case_config) + + op2flops = lambda x: x * math.prod(shape) + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/argmax/metax/C550_64/case_config.yaml b/operation/benchmarks/argmax/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..ebb286609 --- /dev/null +++ b/operation/benchmarks/argmax/metax/C550_64/case_config.yaml @@ -0,0 +1,2 @@ +M: 256 +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/argmax/metax/C550_64/env.sh b/operation/benchmarks/argmax/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/argmax/metax/C550_64/env.sh @@ -0,0 +1 @@ 
+echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/argmax/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/argmax/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..6fa2fce49 --- /dev/null +++ b/operation/benchmarks/argmax/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.22TFLOPS | 0.22TFLOPS | 0.07% | 0.07% | +| nativetorch | True | 0.53TFLOPS | 0.53TFLOPS | 0.17% | 0.17% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 31019.07us | 31038.46us | 32.24op/s | 32.22op/s | 15481359.71us | 31264.4us | +| nativetorch | 12650.3us | 12666.88us | 79.05op/s | 78.95op/s | 35978.48us | 12688.63us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1626.86W | 1716.0W | 140.95W | / | 330.6W | 333.0W | 4.31W | 400W | +| flaggems监控结果 | 1539.2W | 1560.0W | 53.03W | / | 191.29W | 192.0W | 0.98W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.644% | 1.336% | 54.84°C | 
33.454% | +| flaggems监控结果 | 0.659% | 1.208% | 44.58°C | 34.214% | diff --git a/operation/benchmarks/argmax/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/argmax/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..98d4e0535 --- /dev/null +++ b/operation/benchmarks/argmax/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.21TFLOPS | 0.21TFLOPS | 0.07% | 0.07% | +| nativetorch | True | 0.55TFLOPS | 0.55TFLOPS | 0.18% | 0.18% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 31410.93us | 31428.61us | 31.84op/s | 31.82op/s | 14161757.85us | 31670.44us | +| nativetorch | 12174.38us | 12191.74us | 82.14op/s | 82.02op/s | 32883.35us | 12228.56us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1794.0W | 182.93W | / | 352.22W | 355.0W | 5.21W | 400W | +| flaggems监控结果 | 1535.62W | 1560.0W | 53.18W | / | 192.61W | 198.0W | 0.92W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| 
nativetorch监控结果 | 0.639% | 1.376% | 54.12°C | 32.847% | +| flaggems监控结果 | 0.672% | 1.227% | 44.75°C | 34.224% | diff --git a/operation/benchmarks/argmax/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/argmax/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..2dd7f2e7e --- /dev/null +++ b/operation/benchmarks/argmax/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.19TFLOPS | 0.19TFLOPS | 0.98% | 0.98% | +| nativetorch | True | 0.34TFLOPS | 0.34TFLOPS | 1.75% | 1.75% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 35037.52us | 35058.69us | 28.54op/s | 28.52op/s | 14506439.34us | 35723.93us | +| nativetorch | 19617.96us | 19640.32us | 50.97op/s | 50.92op/s | 37356.09us | 19732.22us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1690.0W | 1716.0W | 73.54W | / | 323.32W | 326.0W | 2.67W | 400W | +| flaggems监控结果 | 1618.5W | 1638.0W | 58.5W | / | 223.19W | 225.0W | 1.49W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | 
------------ | -------------- | +| nativetorch监控结果 | 0.752% | 1.168% | 50.93°C | 65.031% | +| flaggems监控结果 | 0.753% | 1.122% | 47.62°C | 65.732% | diff --git a/operation/benchmarks/argmax/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/argmax/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/argmax/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/argmax/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/argmax/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/argmax/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/argmax/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/argmax/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/argmax/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/bitwise_and/cambricon/MLU/case_config.yaml b/operation/benchmarks/bitwise_and/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/bitwise_and/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/bitwise_and/cambricon/MLU/env.sh b/operation/benchmarks/bitwise_and/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/bitwise_and/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/bitwise_and/case_config.yaml b/operation/benchmarks/bitwise_and/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/bitwise_and/case_config.yaml +++ b/operation/benchmarks/bitwise_and/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 
diff --git a/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/README.md b/operation/benchmarks/bitwise_and/iluvatar/BI150/README.md similarity index 55% rename from operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/bitwise_and/iluvatar/BI150/README.md index ec9883a72..4e5e91473 100644 --- a/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/bitwise_and/iluvatar/BI150/README.md @@ -1,57 +1,54 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:9168f2d031ecc1b31a9f658fb66dd6735b7306b3 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | / | / | / | / | -| nativetorch | 0.00E+00 | / | / | / | / | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 108940.43us | 106414.08us | 9.18op/s | 9.4op/s | 279049.06us | 98172.16us | -| nativetorch | 0.00E+00 | 100299.41us | 96353.28us | 9.97op/s | 10.38op/s | 270552.12us | 96428.79us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1560.0W | 1560.0W | 0.0W | / | 59.96W | 62.0W | 1.89W | 1560.0 | -| flaggems监控结果 | 1560.0W | 1560.0W | 0.0W | / | 60.33W | 62.0W | 1.55W | 
1560.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 19.861% | 1.719% | 32.85°C | 1.828% | -| flaggems监控结果 | 19.185% | 1.73% | 33.29°C | 1.034% | +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | / | / | / | / | +| nativetorch | 0.00E+00 | / | / | / | / | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 3482.24us | 3509.47us | 287.17op/s | 284.94op/s | 234861.08us | 3881.73us | +| nativetorch | 0.00E+00 | 3282.83us | 3313.42us | 304.62op/s | 301.8op/s | 3543.5us | 3453.43us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2042.5W | 2090.0W | 47.5W | / | 176.7W | 178.0W | 3.02W | 350W | +| flaggems监控结果 | 2033.0W | 2071.0W | 38.0W | / | 162.5W | 163.0W | 2.2W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 40.862% | 2.406% | 47.12°C | 8.551% | +| flaggems监控结果 | 41.247% | 2.406% | 43.82°C | 6.989% | diff --git 
a/operation/benchmarks/bitwise_and/iluvatar/BI150/case_config.yaml b/operation/benchmarks/bitwise_and/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..dd97b6504 --- /dev/null +++ b/operation/benchmarks/bitwise_and/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: -1.0 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/bitwise_and/iluvatar/BI150/env.sh b/operation/benchmarks/bitwise_and/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/bitwise_and/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/bitwise_and/iluvatar/BI150/requirements.txt b/operation/benchmarks/bitwise_and/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/bitwise_and/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/bitwise_and/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/bitwise_and/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/bitwise_and/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/bitwise_and/kunlunxin/R300p/env.sh b/operation/benchmarks/bitwise_and/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/bitwise_and/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git 
a/operation/benchmarks/bitwise_and/main.py b/operation/benchmarks/bitwise_and/main.py index 4fee17721..d68bb6e4e 100644 --- a/operation/benchmarks/bitwise_and/main.py +++ b/operation/benchmarks/bitwise_and/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,45 +53,35 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation") m = case_config.Melements - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m) - a = (127 * a).to(torch.int8) - b = torch.randn(m) - b = (127 * b).to(torch.int8) - - r_cpu = torch.bitwise_and(a, b) - - a = a.to(0) - b = b.to(0) - r_device = torch.bitwise_and(a, b).cpu() - mape = ((r_device != r_cpu).float().sum()/r_cpu.numel()).item() - - mmape.append(mape) - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) - - a = torch.randn(m, 1024, 1024) - a = (127 * a).to(torch.int8) - b = torch.randn(m, 1024, 1024) - b = (127 * b).to(torch.int8) + low = -32768 + high = 32768 + a = torch.randint(low, high, (m, 1024, 1024), dtype=dtype[config.dataformat]) + a = (127 * a).to(0) + b = torch.randint(low, 
high, (m, 1024, 1024), dtype=dtype[config.dataformat]) + b = (127 * b).to(0) latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.bitwise_and, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: 0.0 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "bitwise_and", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -91,6 +89,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -103,4 +102,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + main(config, case_config) diff --git a/operation/benchmarks/bitwise_and/metax/C550_64/case_config.yaml b/operation/benchmarks/bitwise_and/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..35cf096b2 --- /dev/null +++ b/operation/benchmarks/bitwise_and/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 1000 diff --git a/operation/benchmarks/bitwise_and/metax/C550_64/env.sh b/operation/benchmarks/bitwise_and/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/bitwise_and/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/INT16_README.md b/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/INT16_README.md new file mode 100644 index 000000000..992c3485b --- /dev/null +++ b/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/INT16_README.md @@ -0,0 +1,53 @@ 
+# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.23TFLOPS | 0.23TFLOPS | -23.01% | -22.97% | +| nativetorch | True | 0.23TFLOPS | 0.23TFLOPS | -22.63% | -22.61% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4666.37us | 4673.54us | 214.3op/s | 213.97op/s | 855168.94us | 5038.66us | +| nativetorch | 4745.46us | 4748.29us | 210.73op/s | 210.6op/s | 16824.05us | 4763.28us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 259.0W | 261.0W | 2.0W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 285.0W | 285.0W | 0.0W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.601% | 1.17% | 36.27°C | 26.478% | +| flaggems监控结果 | 0.458% | 1.149% | 36.33°C | 21.238% | diff --git a/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/INT32_README.md b/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/INT32_README.md new file mode 100644 index 000000000..7845e6b5a --- /dev/null +++ 
b/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/INT32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.11TFLOPS | 0.11TFLOPS | 0.59% | 0.59% | +| nativetorch | True | 0.11TFLOPS | 0.11TFLOPS | 0.58% | 0.58% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9343.12us | 9352.19us | 107.03op/s | 106.93op/s | 933369.31us | 9470.82us | +| nativetorch | 9473.68us | 9480.19us | 105.56op/s | 105.48op/s | 23693.9us | 9507.68us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 243.0W | 243.0W | 0.0W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 261.5W | 264.0W | 2.5W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.773% | 1.163% | 36.79°C | 51.739% | +| flaggems监控结果 | 0.606% | 1.213% | 41.0°C | 41.447% | diff --git a/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/case_config.yaml index 75bb60240..35cf096b2 
100644 --- a/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/bitwise_and/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 1000 -SPECTFLOPS: -1.0 \ No newline at end of file diff --git a/operation/benchmarks/bitwise_not/cambricon/MLU/case_config.yaml b/operation/benchmarks/bitwise_not/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/bitwise_not/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/bitwise_not/cambricon/MLU/env.sh b/operation/benchmarks/bitwise_not/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/bitwise_not/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/bitwise_not/case_config.yaml b/operation/benchmarks/bitwise_not/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/bitwise_not/case_config.yaml +++ b/operation/benchmarks/bitwise_not/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/README.md b/operation/benchmarks/bitwise_not/iluvatar/BI150/README.md similarity index 55% rename from operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/bitwise_not/iluvatar/BI150/README.md index 418551b92..d0e8a1cbe 100644 --- a/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/bitwise_not/iluvatar/BI150/README.md @@ -1,57 +1,54 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - 
-https://github.com/FlagOpen/FlagGems. Commit ID:9168f2d031ecc1b31a9f658fb66dd6735b7306b3 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | / | / | / | / | -| nativetorch | 0.00E+00 | / | / | / | / | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 108143.29us | 100317.18us | 9.25op/s | 9.97op/s | 39373.23us | 104317.95us | -| nativetorch | 0.00E+00 | 107053.46us | 102887.42us | 9.34op/s | 9.72op/s | 43383.28us | 98195.36us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1482.0W | 1482.0W | 0.0W | / | 60.39W | 62.0W | 1.5W | 1482.0 | -| flaggems监控结果 | 1560.0W | 1560.0W | 0.0W | / | 60.36W | 62.0W | 1.55W | 1560.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 20.095% | 1.603% | 33.34°C | 1.828% | -| flaggems监控结果 | 20.144% | 1.596% | 33.49°C | 1.034% | +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | 
-------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | / | / | / | / | +| nativetorch | 0.00E+00 | / | / | / | / | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 2052.91us | 2071.01us | 487.11op/s | 482.86op/s | 253735.09us | 2432.21us | +| nativetorch | 0.00E+00 | 2169.4us | 2170.33us | 460.96op/s | 460.76op/s | 2454.73us | 2375.62us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1995.0W | 1995.0W | 0.0W | / | 175.24W | 178.0W | 8.33W | 350W | +| flaggems监控结果 | 1995.0W | 1995.0W | 0.0W | / | 173.14W | 174.0W | 3.4W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 46.287% | 2.393% | 47.29°C | 5.426% | +| flaggems监控结果 | 48.414% | 2.4% | 45.55°C | 6.989% | diff --git a/operation/benchmarks/bitwise_not/iluvatar/BI150/case_config.yaml b/operation/benchmarks/bitwise_not/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..dd97b6504 --- /dev/null +++ b/operation/benchmarks/bitwise_not/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: -1.0 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/bitwise_not/iluvatar/BI150/env.sh b/operation/benchmarks/bitwise_not/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/bitwise_not/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export 
PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/bitwise_not/iluvatar/BI150/requirements.txt b/operation/benchmarks/bitwise_not/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/bitwise_not/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/bitwise_not/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/bitwise_not/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/bitwise_not/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/bitwise_not/kunlunxin/R300p/env.sh b/operation/benchmarks/bitwise_not/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/bitwise_not/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/bitwise_not/main.py b/operation/benchmarks/bitwise_not/main.py index 4151106f2..f9585c862 100644 --- a/operation/benchmarks/bitwise_not/main.py +++ b/operation/benchmarks/bitwise_not/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") 
parser.add_argument("--dataformat", type=str, @@ -45,40 +53,35 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation") m = case_config.Melements - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m) - a = (127 * a).to(torch.int8) - - r_cpu = torch.bitwise_not(a) - - a = a.to(0) - r_device = torch.bitwise_not(a).cpu() - mape = ((r_device != r_cpu).float().sum()/r_cpu.numel()).item() - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) + low = -32768 + high = 32768 + a = torch.randint(low, high, (m, 1024, 1024), dtype=dtype[config.dataformat]) - a = torch.randn(m, 1024, 1024) - a = (127 * a).to(torch.int8) + a = (127 * a).to(0) latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.bitwise_not, (a, ), host_device_sync, config, case_config) - op2flops = lambda x: 0.0 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "bitwise_not", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -86,6 +89,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -98,4 +102,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + main(config, 
case_config) diff --git a/operation/benchmarks/bitwise_not/metax/C550_64/case_config.yaml b/operation/benchmarks/bitwise_not/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..35cf096b2 --- /dev/null +++ b/operation/benchmarks/bitwise_not/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 1000 diff --git a/operation/benchmarks/bitwise_not/metax/C550_64/env.sh b/operation/benchmarks/bitwise_not/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/bitwise_not/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/INT16_README.md b/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/INT16_README.md new file mode 100644 index 000000000..a67a44974 --- /dev/null +++ b/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/INT16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.33TFLOPS | -33.57% | -33.49% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | -34.87% | -34.79% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3198.96us | 3206.14us | 312.6op/s | 311.9op/s | 816660.83us | 3283.99us | +| nativetorch | 3079.52us | 3086.34us | 324.73op/s | 324.01op/s | 36638.09us | 3099.98us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 274.0W | 274.0W | 0.0W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 301.0W | 301.0W | 0.0W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.782% | 1.107% | 34.61°C | 16.373% | +| flaggems监控结果 | 0.382% | 1.107% | 34.64°C | 16.186% | diff --git a/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/INT32_README.md b/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/INT32_README.md new file mode 100644 index 000000000..d3a32d36b --- /dev/null +++ b/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/INT32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 
算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6261.35us | 6276.1us | 159.71op/s | 159.33op/s | 1787221.85us | 6375.46us | +| nativetorch | 6195.98us | 6197.25us | 161.4op/s | 161.36op/s | 23121.16us | 6223.68us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 248.0W | 249.0W | 1.0W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 270.0W | 270.0W | 0.0W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.661% | 1.121% | 35.47°C | 31.53% | +| flaggems监控结果 | 1.029% | 1.163% | 36.0°C | 41.627% | diff --git a/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/case_config.yaml index 75bb60240..35cf096b2 100644 --- a/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/bitwise_not/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 1000 -SPECTFLOPS: -1.0 \ No newline at end of file diff --git a/operation/benchmarks/bitwise_or/cambricon/MLU/case_config.yaml b/operation/benchmarks/bitwise_or/cambricon/MLU/case_config.yaml new file 
mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/bitwise_or/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/bitwise_or/cambricon/MLU/env.sh b/operation/benchmarks/bitwise_or/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/bitwise_or/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/bitwise_or/case_config.yaml b/operation/benchmarks/bitwise_or/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/bitwise_or/case_config.yaml +++ b/operation/benchmarks/bitwise_or/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/README.md b/operation/benchmarks/bitwise_or/iluvatar/BI150/README.md similarity index 52% rename from operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/bitwise_or/iluvatar/BI150/README.md index 4028361ca..39094e9e5 100644 --- a/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/bitwise_or/iluvatar/BI150/README.md @@ -1,57 +1,54 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. 
Commit ID:9168f2d031ecc1b31a9f658fb66dd6735b7306b3 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | / | / | / | / | -| nativetorch | 0.00E+00 | / | / | / | / | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 104786.08us | 100961.28us | 9.54op/s | 9.9op/s | 298346.76us | 98256.06us | -| nativetorch | 0.00E+00 | 107694.18us | 99507.2us | 9.29op/s | 10.05op/s | 269314.33us | 98708.36us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1560.0W | 1560.0W | 0.0W | / | 60.15W | 62.0W | 1.68W | 1560.0 | -| flaggems监控结果 | 1560.0W | 1560.0W | 0.0W | / | 60.45W | 62.0W | 1.5W | 1560.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 18.648% | 1.73% | 33.16°C | 1.221% | -| flaggems监控结果 | 19.654% | 1.743% | 33.25°C | 1.034% | +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | 
----- | +| flaggems | 0.00E+00 | / | / | / | / | +| nativetorch | 0.00E+00 | / | / | / | / | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 3480.73us | 3508.1us | 287.3op/s | 285.05op/s | 236354.23us | 3814.93us | +| nativetorch | 0.00E+00 | 3295.92us | 3311.87us | 303.41op/s | 301.94op/s | 3554.28us | 3491.59us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2042.5W | 2090.0W | 47.5W | / | 177.28W | 178.0W | 0.51W | 350W | +| flaggems监控结果 | 2033.0W | 2071.0W | 38.0W | / | 163.91W | 164.0W | 0.28W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 47.542% | 2.401% | 47.39°C | 8.551% | +| flaggems监控结果 | 49.818% | 2.405% | 44.34°C | 6.989% | \ No newline at end of file diff --git a/operation/benchmarks/bitwise_or/iluvatar/BI150/case_config.yaml b/operation/benchmarks/bitwise_or/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..dd97b6504 --- /dev/null +++ b/operation/benchmarks/bitwise_or/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: -1.0 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/bitwise_or/iluvatar/BI150/env.sh b/operation/benchmarks/bitwise_or/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/bitwise_or/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export 
PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/bitwise_or/iluvatar/BI150/requirements.txt b/operation/benchmarks/bitwise_or/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/bitwise_or/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/bitwise_or/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/bitwise_or/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/bitwise_or/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/bitwise_or/kunlunxin/R300p/env.sh b/operation/benchmarks/bitwise_or/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/bitwise_or/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/bitwise_or/main.py b/operation/benchmarks/bitwise_or/main.py index 98f67a178..d011b4df8 100644 --- a/operation/benchmarks/bitwise_or/main.py +++ b/operation/benchmarks/bitwise_or/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") 
parser.add_argument("--dataformat", type=str, @@ -45,45 +53,35 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation") m = case_config.Melements - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m) - a = (127 * a).to(torch.int8) - b = torch.randn(m) - b = (127 * b).to(torch.int8) - - r_cpu = torch.bitwise_or(a, b) - - a = a.to(0) - b = b.to(0) - r_device = torch.bitwise_or(a, b).cpu() - mape = ((r_device != r_cpu).float().sum()/r_cpu.numel()).item() - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) - - a = torch.randn(m, 1024, 1024) - a = (127 * a).to(torch.int8) - b = torch.randn(m, 1024, 1024) - b = (127 * b).to(torch.int8) + low = -32768 + high = 32767 + a = torch.randint(low, high, (m, 1024, 1024), dtype=dtype[config.dataformat]) + a = (127 * a).to(0) + b = torch.randint(low, high, (m, 1024, 1024), dtype=dtype[config.dataformat]) + b = (127 * b).to(0) latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.bitwise_or, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: 0.0 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "bitwise_or", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -91,6 +89,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, 
"case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -103,4 +102,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + main(config, case_config) diff --git a/operation/benchmarks/bitwise_or/metax/C550_64/case_config.yaml b/operation/benchmarks/bitwise_or/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..35cf096b2 --- /dev/null +++ b/operation/benchmarks/bitwise_or/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 1000 diff --git a/operation/benchmarks/bitwise_or/metax/C550_64/env.sh b/operation/benchmarks/bitwise_or/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/bitwise_or/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/INT16_README.md b/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/INT16_README.md new file mode 100644 index 000000000..aabff7dba --- /dev/null +++ b/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/INT16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.23TFLOPS | 0.23TFLOPS | -23.01% | -22.97% | +| nativetorch | True | 0.23TFLOPS | 0.23TFLOPS | -22.63% | -22.61% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4666.25us | 4673.54us | 214.3op/s | 213.97op/s | 842234.06us | 4785.03us | +| nativetorch | 4745.8us | 4748.29us | 210.71op/s | 210.6op/s | 17700.01us | 4969.85us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 258.0W | 259.0W | 1.0W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 285.0W | 285.0W | 0.0W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.619% | 1.177% | 35.73°C | 26.478% | +| flaggems监控结果 | 0.515% | 1.142% | 36.0°C | 21.238% | diff --git a/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/INT32_README.md b/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/INT32_README.md new file mode 100644 index 000000000..818555580 --- /dev/null +++ b/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/INT32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.11TFLOPS | 0.11TFLOPS | 0.59% | 0.59% | +| nativetorch | True | 0.11TFLOPS | 0.11TFLOPS | 0.58% | 0.58% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9341.61us | 9349.12us | 107.05op/s | 106.96op/s | 1626746.19us | 9465.18us | +| nativetorch | 9473.69us | 9480.19us | 105.56op/s | 105.48op/s | 23299.54us | 9508.7us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 242.5W | 244.0W | 1.5W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 262.5W | 264.0W | 1.5W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.503% | 1.156% | 35.94°C | 51.739% | +| flaggems监控结果 | 1.516% | 1.26% | 35.61°C | 41.447% | diff --git a/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/case_config.yaml index 75bb60240..35cf096b2 100644 --- a/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/bitwise_or/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 1000 -SPECTFLOPS: -1.0 \ No newline at end of file diff --git a/operation/benchmarks/bmm/cambricon/MLU/case_config.yaml b/operation/benchmarks/bmm/cambricon/MLU/case_config.yaml new file mode 100644 index 
000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/bmm/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/bmm/cambricon/MLU/env.sh b/operation/benchmarks/bmm/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/bmm/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/bmm/case_config.yaml b/operation/benchmarks/bmm/case_config.yaml index 6d5edc600..99833cbf0 100644 --- a/operation/benchmarks/bmm/case_config.yaml +++ b/operation/benchmarks/bmm/case_config.yaml @@ -2,7 +2,6 @@ M: 4096 N: 4096 K: 4096 BS: 2 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 200000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/bmm/iluvatar/BI150/README.md b/operation/benchmarks/bmm/iluvatar/BI150/README.md new file mode 100644 index 000000000..f5a86d024 --- /dev/null +++ b/operation/benchmarks/bmm/iluvatar/BI150/README.md @@ -0,0 +1,57 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 注意事项 +测试bmm时必须调节降频问题,因此需要:bash vendors/iluvatar/dvfs.sh && python3 run.py + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 1.76E-04 | 89.25TFLOPS | 88.91TFLOPS | 46.31% | 46.27% | +| nativetorch | 1.76E-04 | 94.69TFLOPS | 94.41TFLOPS | 49.29% | 49.17% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | 
-------------- | ------------ | +| flaggems | 1.33E-06 | 1540.0us | 1545.83us | 649.35op/s | 646.9op/s | 23206046.88us | 1915.09us | +| nativetorch | 1.33E-06 | 1451.5us | 1455.83us | 688.94op/s | 686.89op/s | 2024.32us | 1620.88us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2033.0W | 2033.0W | 0.0W | / | 285.27W | 287.0W | 1.81W | 350W | +| flaggems监控结果 | 2128.0W | 2223.0W | 95.0W | / | 274.0W | 276.0W | 2.62W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 43.107% | 2.386% | 80.8°C | 1.88% | +| flaggems监控结果 | 47.647% | 2.446% | 78.5°C | 2.24% | \ No newline at end of file diff --git a/operation/benchmarks/bmm/iluvatar/BI150/case_config.yaml b/operation/benchmarks/bmm/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..f1dbe8c8f --- /dev/null +++ b/operation/benchmarks/bmm/iluvatar/BI150/case_config.yaml @@ -0,0 +1,9 @@ +M: 2048 +N: 4096 +K: 4096 +BS: 2 +SPECTFLOPS: 192 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/bmm/iluvatar/BI150/env.sh b/operation/benchmarks/bmm/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/bmm/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/bmm/iluvatar/BI150/requirements.txt b/operation/benchmarks/bmm/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- 
/dev/null +++ b/operation/benchmarks/bmm/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/bmm/main.py b/operation/benchmarks/bmm/main.py index 1ee39add6..116e6c38d 100644 --- a/operation/benchmarks/bmm/main.py +++ b/operation/benchmarks/bmm/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,38 +53,24 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 16-times smaller operation" - ) # correctness is implemented casebycase m = case_config.M n = case_config.N k = case_config.K bs = case_config.BS - dtype = {"FP16": torch.float16} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(bs, m//16, n//16, dtype=dtype[config.dataformat]) - b = torch.randn(bs, n//16, k//16, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.bmm(a_fp64, b_fp64) - - a = a.to(0) - b = b.to(0) - r_device = torch.bmm(a, b).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = 
torch.std(torch.tensor(mmape)) a = torch.randn(bs, m, n, dtype=dtype[config.dataformat]).to(0) b = torch.randn(bs, n, k, dtype=dtype[config.dataformat]).to(0) @@ -87,8 +81,8 @@ def main(config, case_config): op2flops = lambda x: x * 2 * m * n * k * bs perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "bmm", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -96,6 +90,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/bmm/metax/C550_64/case_config.yaml b/operation/benchmarks/bmm/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..69e07c25c --- /dev/null +++ b/operation/benchmarks/bmm/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 200000 diff --git a/operation/benchmarks/bmm/metax/C550_64/env.sh b/operation/benchmarks/bmm/metax/C550_64/env.sh new file mode 100644 index 000000000..79ff0fea1 --- /dev/null +++ b/operation/benchmarks/bmm/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_ENABLE_COMPILER_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/bmm/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/bmm/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..42420d0be --- /dev/null +++ b/operation/benchmarks/bmm/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + 
+# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 240.66TFLOPS | 251.58TFLOPS | 77.13% | 80.63% | +| nativetorch | True | 247.68TFLOPS | 257.12TFLOPS | 79.39% | 82.41% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1142.2us | 1092.61us | 875.5op/s | 915.24op/s | 8017845.66us | 1175.53us | +| nativetorch | 1109.81us | 1069.06us | 901.06op/s | 935.4op/s | 175370.5us | 1260.25us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1537.71W | 1872.0W | 136.47W | / | 396.18W | 408.0W | 20.14W | 400W | +| flaggems监控结果 | 1579.5W | 1872.0W | 168.87W | / | 391.02W | 407.0W | 26.96W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.815% | 2.338% | 64.0°C | 3.486% | +| flaggems监控结果 | 0.793% | 2.337% | 63.12°C | 3.486% | diff --git a/operation/benchmarks/bmm/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/bmm/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..ab5cc7ce8 --- /dev/null +++ b/operation/benchmarks/bmm/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 234.97TFLOPS | 242.71TFLOPS | 75.31% | 77.79% | +| nativetorch | True | 241.29TFLOPS | 251.11TFLOPS | 77.34% | 80.48% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1169.85us | 1132.54us | 854.81op/s | 882.97op/s | 8040399.85us | 1201.18us | +| nativetorch | 1139.21us | 1094.66us | 877.8op/s | 913.53op/s | 464553.37us | 1142.24us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1579.5W | 1872.0W | 168.87W | / | 395.78W | 410.0W | 13.99W | 400W | +| flaggems监控结果 | 1579.5W | 1872.0W | 168.87W | / | 396.08W | 413.0W | 16.27W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.813% | 2.338% | 63.87°C | 3.486% | +| flaggems监控结果 | 1.158% | 2.338% | 63.5°C | 3.486% | diff --git a/operation/benchmarks/bmm/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/bmm/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..2f7110330 --- /dev/null +++ b/operation/benchmarks/bmm/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* 
docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 18.6TFLOPS | 18.6TFLOPS | 95.37% | 95.37% | +| nativetorch | True | 19.13TFLOPS | 19.13TFLOPS | 98.12% | 98.1% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 14780.99us | 14780.42us | 67.65op/s | 67.66op/s | 18475540.65us | 15099.9us | +| nativetorch | 14366.06us | 14369.79us | 69.61op/s | 69.59op/s | 137728.38us | 14400.98us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1713.4W | 1794.0W | 111.19W | / | 339.56W | 342.0W | 7.01W | 400W | +| flaggems监控结果 | 1729.0W | 1794.0W | 130.65W | / | 352.05W | 357.0W | 8.11W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.785% | 2.297% | 55.08°C | 3.486% | +| flaggems监控结果 | 0.929% | 2.297% | 62.66°C | 4.399% | diff --git a/operation/benchmarks/bmm/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/bmm/nvidia/A100_40_SXM/case_config.yaml index d1549b623..69e07c25c 100644 --- a/operation/benchmarks/bmm/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/bmm/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 200000 -SPECTFLOPS: 312 \ No newline at end of file diff --git a/operation/benchmarks/cos/cambricon/MLU/case_config.yaml 
b/operation/benchmarks/cos/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/cos/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/cos/cambricon/MLU/env.sh b/operation/benchmarks/cos/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/cos/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/cos/case_config.yaml b/operation/benchmarks/cos/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/cos/case_config.yaml +++ b/operation/benchmarks/cos/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/cos/iluvatar/BI150/README.md b/operation/benchmarks/cos/iluvatar/BI150/README.md new file mode 100644 index 000000000..f44f71582 --- /dev/null +++ b/operation/benchmarks/cos/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 2.26E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.29% | +| nativetorch | 2.26E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | 
------------ | +| flaggems | 7.84E-10 | 7402.41us | 7419.34us | 135.09op/s | 134.78op/s | 412238.68us | 7853.85us | +| nativetorch | 7.84E-10 | 7373.38us | 7375.12us | 135.62op/s | 135.59op/s | 7734.97us | 7665.34us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2085.25W | 2109.0W | 41.14W | / | 185.48W | 186.0W | 4.3W | 350W | +| flaggems监控结果 | 2085.25W | 2128.0W | 52.68W | / | 192.82W | 193.0W | 0.42W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 42.041% | 2.388% | 52.48°C | 30.432% | +| flaggems监控结果 | 42.401% | 2.391% | 53.47°C | 20.856% | \ No newline at end of file diff --git a/operation/benchmarks/cos/iluvatar/BI150/case_config.yaml b/operation/benchmarks/cos/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/cos/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/cos/iluvatar/BI150/env.sh b/operation/benchmarks/cos/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/cos/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/cos/iluvatar/BI150/requirements.txt b/operation/benchmarks/cos/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ 
b/operation/benchmarks/cos/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/cos/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/cos/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/cos/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/cos/kunlunxin/R300p/env.sh b/operation/benchmarks/cos/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..38a0db6a6 --- /dev/null +++ b/operation/benchmarks/cos/kunlunxin/R300p/env.sh @@ -0,0 +1,6 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +export XPU_enable_reorder=1 + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/cos/main.py b/operation/benchmarks/cos/main.py index c15f6515d..b852153bd 100644 --- a/operation/benchmarks/cos/main.py +++ b/operation/benchmarks/cos/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times 
smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.cos(a_fp64) - - a = a.to(0) - r_device = torch.cos(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "cos", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/cos/metax/C550_64/case_config.yaml b/operation/benchmarks/cos/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/cos/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/cos/metax/C550_64/env.sh b/operation/benchmarks/cos/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/cos/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/cos/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/cos/nvidia/A100_40_SXM/BF16_README.md 
new file mode 100644 index 000000000..325f50073 --- /dev/null +++ b/operation/benchmarks/cos/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3798.72us | 3805.18us | 263.25op/s | 262.8op/s | 2138545.26us | 3894.67us | +| nativetorch | 3253.0us | 3258.37us | 307.41op/s | 306.9op/s | 25265.41us | 3305.84us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1794.0W | 156.0W | / | 362.72W | 369.0W | 8.31W | 400W | +| flaggems监控结果 | 1716.0W | 1872.0W | 168.5W | / | 387.47W | 392.0W | 7.34W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.135% | 2.31% | 55.98°C | 17.394% | +| flaggems监控结果 | 0.836% | 2.306% | 62.46°C | 17.212% | diff --git a/operation/benchmarks/cos/nvidia/A100_40_SXM/FP16_README.md 
b/operation/benchmarks/cos/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..d3475a138 --- /dev/null +++ b/operation/benchmarks/cos/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3802.85us | 3802.11us | 262.96op/s | 263.01op/s | 969585.06us | 3902.39us | +| nativetorch | 3252.33us | 3256.32us | 307.47op/s | 307.1op/s | 15708.36us | 3274.72us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1794.0W | 156.0W | / | 378.61W | 385.0W | 9.18W | 400W | +| flaggems监控结果 | 1690.0W | 1872.0W | 204.72W | / | 396.34W | 403.0W | 8.16W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.891% | 2.31% | 55.96°C | 17.394% | +| flaggems监控结果 | 0.75% | 2.309% | 62.95°C | 17.212% | diff --git 
a/operation/benchmarks/cos/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/cos/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..42ecc4d4b --- /dev/null +++ b/operation/benchmarks/cos/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6256.12us | 6275.07us | 159.84op/s | 159.36op/s | 1034657.83us | 6355.39us | +| nativetorch | 6163.4us | 6165.5us | 162.25op/s | 162.19op/s | 29524.39us | 6189.05us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1716.0W | 110.31W | / | 317.44W | 320.0W | 3.56W | 400W | +| flaggems监控结果 | 1690.0W | 1794.0W | 147.08W | / | 355.25W | 359.0W | 4.91W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.739% | 2.292% | 52.07°C | 42.656% | +| flaggems监控结果 | 0.758% | 2.29% | 53.63°C | 
32.369% | diff --git a/operation/benchmarks/cos/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/cos/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/cos/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/cos/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/cross_entropy_loss/cambricon/MLU/case_config.yaml b/operation/benchmarks/cross_entropy_loss/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/cross_entropy_loss/cambricon/MLU/env.sh b/operation/benchmarks/cross_entropy_loss/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/cross_entropy_loss/case_config.yaml b/operation/benchmarks/cross_entropy_loss/case_config.yaml new file mode 100644 index 000000000..98c507568 --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/case_config.yaml @@ -0,0 +1,6 @@ +bs: 1024 +elements: 128000 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/cross_entropy_loss/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/cross_entropy_loss/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..fd65a50d9 --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,4 @@ +bs: 4096 +elements: 256 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/cross_entropy_loss/kunlunxin/R300p/env.sh b/operation/benchmarks/cross_entropy_loss/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..38a0db6a6 --- /dev/null +++ 
b/operation/benchmarks/cross_entropy_loss/kunlunxin/R300p/env.sh @@ -0,0 +1,6 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +export XPU_enable_reorder=1 + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/cross_entropy_loss/main.py b/operation/benchmarks/cross_entropy_loss/main.py new file mode 100644 index 000000000..6444c1360 --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/main.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + elements = case_config.elements 
+ bs = case_config.bs + a = torch.randn(bs, elements, dtype=dtype[config.dataformat], requires_grad=True).to(0) + target = torch.empty(bs, dtype=torch.int64).random_(elements).to(0) + f = torch.nn.CrossEntropyLoss() + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, target), host_device_sync, config, case_config, bp=True) + + op2flops = lambda x: x * bs * elements * 3 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/cross_entropy_loss/metax/C550_64/case_config.yaml b/operation/benchmarks/cross_entropy_loss/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/cross_entropy_loss/metax/C550_64/env.sh b/operation/benchmarks/cross_entropy_loss/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 
000000000..ed1193372 --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.69TFLOPS | 0.69TFLOPS | 0.22% | 0.22% | +| nativetorch | True | 0.65TFLOPS | 0.64TFLOPS | 0.21% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1707.92us | 1712.13us | 585.51op/s | 584.07op/s | 10886259.27us | 963.06us | +| nativetorch | 1824.07us | 1830.91us | 548.23op/s | 546.18op/s | 16398.82us | 845.51us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1501.5W | 1638.0W | 85.0W | / | 217.06W | 261.0W | 19.3W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 55.15W | / | 215.16W | 250.0W | 17.61W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 2.408% | 1.704% | 43.89°C | 4.976% | +| flaggems监控结果 | 2.154% | 1.679% | 43.62°C | 4.981% | diff --git a/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/FP16_README.md 
b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..2297cb3ea --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.7TFLOPS | 0.69TFLOPS | 0.22% | 0.22% | +| nativetorch | True | 0.64TFLOPS | 0.64TFLOPS | 0.21% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1693.8us | 1697.79us | 590.39op/s | 589.0op/s | 10801391.18us | 970.74us | +| nativetorch | 1829.79us | 1840.13us | 546.51op/s | 543.44op/s | 19897.53us | 868.77us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 55.15W | / | 216.5W | 243.0W | 23.82W | 400W | +| flaggems监控结果 | 1540.5W | 1638.0W | 85.0W | / | 215.38W | 244.0W | 20.98W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.114% | 1.281% | 43.72°C | 4.981% | +| flaggems监控结果 | 1.05% | 1.282% | 43.54°C | 4.981% | diff --git 
a/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..14a00c28c --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.48TFLOPS | 0.49TFLOPS | 2.48% | 2.5% | +| nativetorch | True | 0.34TFLOPS | 0.34TFLOPS | 1.73% | 1.72% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2439.16us | 2423.81us | 409.98op/s | 412.57op/s | 10596427.43us | 1075.44us | +| nativetorch | 3493.21us | 3512.32us | 286.27op/s | 284.71op/s | 16326.47us | 1608.61us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1540.5W | 1638.0W | 85.0W | / | 217.87W | 233.0W | 18.73W | 400W | +| flaggems监控结果 | 1540.5W | 1638.0W | 85.0W | / | 217.5W | 249.0W | 17.74W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.984% | 1.283% | 44.0°C | 8.622% | +| 
flaggems监控结果 | 0.855% | 1.283% | 44.09°C | 4.981% | diff --git a/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/cross_entropy_loss/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/div/cambricon/MLU/case_config.yaml b/operation/benchmarks/div/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/div/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/div/cambricon/MLU/env.sh b/operation/benchmarks/div/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/div/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/div/case_config.yaml b/operation/benchmarks/div/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/div/case_config.yaml +++ b/operation/benchmarks/div/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git 
a/operation/benchmarks/div/iluvatar/BI150/README.md b/operation/benchmarks/div/iluvatar/BI150/README.md new file mode 100644 index 000000000..445a1ea2b --- /dev/null +++ b/operation/benchmarks/div/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.29% | +| nativetorch | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 7401.79us | 7417.78us | 135.1op/s | 134.81op/s | 236832.66us | 7837.8us | +| nativetorch | 0.00E+00 | 7390.62us | 7392.9us | 135.31op/s | 135.26op/s | 7660.95us | 7643.15us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2075.75W | 2090.0W | 24.68W | / | 164.14W | 165.0W | 4.56W | 350W | +| flaggems监控结果 | 2066.25W | 2090.0W | 41.14W | / | 175.82W | 176.0W | 0.42W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 40.643% | 2.388% | 46.47°C | 25.739% | +| flaggems监控结果 | 40.075% | 2.391% | 47.56°C 
| 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/div/iluvatar/BI150/case_config.yaml b/operation/benchmarks/div/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/div/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/div/iluvatar/BI150/env.sh b/operation/benchmarks/div/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/div/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/div/iluvatar/BI150/requirements.txt b/operation/benchmarks/div/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/div/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/div/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/div/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/div/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/div/kunlunxin/R300p/env.sh b/operation/benchmarks/div/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/div/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/div/main.py 
b/operation/benchmarks/div/main.py index f64fdae5c..4014cea2a 100644 --- a/operation/benchmarks/div/main.py +++ b/operation/benchmarks/div/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase Melements = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(Melements, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.div(a, 0.5) - - a = a.to(0) - r_device = torch.div(a, 0.5).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(Melements * 1024 * 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * Melements * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "div", *perf_result, 
mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/div/metax/C550_64/case_config.yaml b/operation/benchmarks/div/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/div/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/div/metax/C550_64/env.sh b/operation/benchmarks/div/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/div/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/div/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/div/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..e95cac7c4 --- /dev/null +++ b/operation/benchmarks/div/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3184.91us | 3189.76us | 313.98op/s | 313.5op/s | 2152516.23us | 3273.44us | +| nativetorch | 3081.16us | 3087.36us | 324.55op/s | 323.9op/s | 20795.63us | 3126.76us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 265.39W | 270.0W | 4.05W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 265.41W | 271.0W | 3.61W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.849% | 2.309% | 47.89°C | 17.394% | +| flaggems监控结果 | 0.845% | 2.309% | 47.77°C | 17.207% | diff --git a/operation/benchmarks/div/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/div/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..992f11b65 --- /dev/null +++ b/operation/benchmarks/div/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3182.98us | 3185.66us | 314.17op/s | 313.91op/s | 1595915.65us | 3282.36us | +| nativetorch | 3079.71us | 3086.34us | 324.71op/s | 324.01op/s | 18345.34us | 3103.87us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 277.1W | 281.0W | 3.98W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 276.09W | 280.0W | 4.39W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.8% | 2.316% | 49.11°C | 17.394% | +| flaggems监控结果 | 1.053% | 2.314% | 48.84°C | 17.207% | diff --git a/operation/benchmarks/div/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/div/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..fa5703d0e --- /dev/null +++ b/operation/benchmarks/div/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 
+ +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6252.31us | 6261.76us | 159.94op/s | 159.7op/s | 1499794.35us | 6350.57us | +| nativetorch | 6195.3us | 6197.25us | 161.41op/s | 161.36op/s | 24735.93us | 6471.84us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 261.55W | 265.0W | 2.94W | 400W | +| flaggems监控结果 | 1586.0W | 1638.0W | 73.54W | / | 265.08W | 268.0W | 3.42W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.727% | 2.288% | 48.95°C | 32.551% | +| flaggems监控结果 | 1.011% | 2.291% | 49.28°C | 32.364% | diff --git a/operation/benchmarks/div/nvidia/A100_40_SXM/README.md b/operation/benchmarks/div/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 71dca47e1..000000000 --- a/operation/benchmarks/div/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6195.01us | 6197.25us | 161.42op/s | 161.36op/s | 11650.31us | 6406.51us | -| nativetorch | 0.00E+00 | 6196.01us | 6200.32us | 161.39op/s | 161.28op/s | 716341.29us | 6211.2us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 257.05W | 260.0W | 3.09W | 1638.0 | -| flaggems监控结果 | 1677.0W | 1716.0W | 39.0W | / | 271.45W | 275.0W | 3.94W | 1677.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.775% | 1.298% | 48.03°C | 31.535% | -| flaggems监控结果 | 0.666% | 1.298% | 49.94°C | 31.347% | diff --git a/operation/benchmarks/div/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/div/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/div/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/div/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/drivers/calculate.py 
b/operation/benchmarks/drivers/calculate.py index d6bcc4fe8..cde6329b1 100644 --- a/operation/benchmarks/drivers/calculate.py +++ b/operation/benchmarks/drivers/calculate.py @@ -5,9 +5,39 @@ # -*- coding: UTF-8 -*- import time from triton.testing import do_bench as kernel_bench +import os +import subprocess + + +def do_correctness(operation): + flaggems_dir = os.getenv("FLAGGEMS_WORK_DIR", "/") + gems_repo = subprocess.check_output( + ["find", flaggems_dir, "-type", "d", "-name", "FlagGems"], text=True).strip() + + p = subprocess.Popen( + f"cd {os.path.join(gems_repo, 'tests')} && python3 test_named_ops.py --name {operation} --device cpu ", + shell=True + ) + p.wait() + + return p.returncode + +grad_outputs = None + +def do(exec_func, exec_args, bp=False): + global grad_outputs + if bp: + import torch + _tensor = exec_func(*exec_args).sum() + if grad_outputs is None: + grad_outputs = torch.zeros_like(_tensor) + inputs = list(filter(lambda x: x.requires_grad, [*exec_args])) + _grad = torch.autograd.grad(outputs=_tensor, inputs=inputs, grad_outputs=grad_outputs) + else: + _tensor = exec_func(*exec_args) -def do_test(exec_func, exec_args, sync_func, config, case_config): +def do_test(exec_func, exec_args, sync_func, config, case_config, bp=False): sync_func(config.vendor) start_latency_nowarm = time.perf_counter_ns() _tensor = exec_func(*exec_args) @@ -16,7 +46,7 @@ def do_test(exec_func, exec_args, sync_func, config, case_config): latency_nowarm = time.perf_counter_ns() - start_latency_nowarm for _ in range(case_config.WARMUP): - _tensor = exec_func(*exec_args) + do(exec_func, exec_args, bp) sync_func(config.vendor) start_latency_warm = time.perf_counter_ns() @@ -27,14 +57,14 @@ def do_test(exec_func, exec_args, sync_func, config, case_config): start_time = time.perf_counter() for _ in range(case_config.ITERS): - _tensor = exec_func(*exec_args) + do(exec_func, exec_args, bp) sync_func(config.vendor) end_time = time.perf_counter() cputime_raw = end_time - start_time - 
kerneltime_raw = kernel_bench(lambda: exec_func(*exec_args), + kerneltime_raw = kernel_bench(lambda: do(exec_func, exec_args, bp), warmup=case_config.KERNELWARMUP, rep=case_config.KERNELITERS, return_mode="median") @@ -44,15 +74,16 @@ def do_test(exec_func, exec_args, sync_func, config, case_config): 2), cputime, kerneltime -def cal_perf(cputime, kerneltime, op2flops, spectflops): +def cal_perf(cputime, kerneltime, op2flops, spectflops, bp=False): + spectflops = float(spectflops) ctus = round(cputime * 1E6, 2) ktus = round(kerneltime * 1E6, 2) cps = 1.0 / cputime kps = 1.0 / kerneltime - cflops = op2flops(cps) - kflops = op2flops(kps) + cflops = op2flops(cps) * (3.0 if bp else 1.0) + kflops = op2flops(kps) * (3.0 if bp else 1.0) ctflops = round(cflops / 1E12, 2) ktflops = round(kflops / 1E12, 2) @@ -63,7 +94,7 @@ def cal_perf(cputime, kerneltime, op2flops, spectflops): def print_result(config, casename, ct, kt, cps, kps, ctflops, ktflops, cfu, - kfu, errmean, errstd, lnm, lm): + kfu, correctness, lnm, lm): print(r"[FlagPerf Result]Operation {} in {} at {}:".format( casename, config.oplib, config.dataformat)) print(r"[FlagPerf Result]FLOPS utilization: cputime={}%, kerneltime={}%". @@ -74,8 +105,8 @@ def print_result(config, casename, ct, kt, cps, kps, ctflops, ktflops, cfu, print( r"[FlagPerf Result]kerneltime={} us, throughput={} op/s, equals to {} TFLOPS" .format(kt, kps, ktflops)) - print(r"[FlagPerf Result]Relative error with FP64-CPU: mean={}, std={}". - format(errmean, errstd)) + print(r"[FlagPerf Result]Correctness with CPU golden Reference: {}".format( + correctness)) print( r"[FlagPerf Result]First time latency: no warmup={} us, warmup={} us". 
format(lnm, lm)) diff --git a/operation/benchmarks/drivers/utils.py b/operation/benchmarks/drivers/utils.py index a05954a0b..4be5d92ba 100644 --- a/operation/benchmarks/drivers/utils.py +++ b/operation/benchmarks/drivers/utils.py @@ -6,7 +6,17 @@ import torch +def adapt_torch(vendor): + if vendor == "nvidia": + print("nvidia does nothing") + elif vendor == "cambricon": + from torch_mlu.utils.model_transfer import transfer + else: + print("unspecified vendor {}, do nothing".format(vendor)) + + def set_ieee_float32(vendor): + adapt_torch(vendor) if vendor == "nvidia": torch.backends.cuda.matmul.allow_tf32 = False else: @@ -14,6 +24,7 @@ def set_ieee_float32(vendor): def unset_ieee_float32(vendor): + adapt_torch(vendor) if vendor == "nvidia": torch.backends.cuda.matmul.allow_tf32 = True else: @@ -21,18 +32,22 @@ def unset_ieee_float32(vendor): def host_device_sync(vendor): + adapt_torch(vendor) if vendor == "nvidia": torch.cuda.synchronize() else: - print("unspecified vendor {}, using default pytorch \"torch.cuda.synchronize\"".format(vendor)) + print( + "unspecified vendor {}, using default pytorch \"torch.cuda.synchronize\"" + .format(vendor)) torch.cuda.synchronize() def multi_device_sync(vendor): + adapt_torch(vendor) if vendor == "nvidia": torch.distributed.barrier() else: - print("unspecified vendor {}, using default pytorch \"torch.distributed.barrier\"".format(vendor)) + print( + "unspecified vendor {}, using default pytorch \"torch.distributed.barrier\"" + .format(vendor)) torch.distributed.barrier() - - diff --git a/operation/benchmarks/dropout/cambricon/MLU/case_config.yaml b/operation/benchmarks/dropout/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..8868324fc --- /dev/null +++ b/operation/benchmarks/dropout/cambricon/MLU/case_config.yaml @@ -0,0 +1,2 @@ +SPECTFLOPS: 999999 +Melements: 1 diff --git a/operation/benchmarks/dropout/cambricon/MLU/env.sh b/operation/benchmarks/dropout/cambricon/MLU/env.sh new file mode 100644 index 
000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/dropout/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/dropout/case_config.yaml b/operation/benchmarks/dropout/case_config.yaml new file mode 100644 index 000000000..acc0f44fb --- /dev/null +++ b/operation/benchmarks/dropout/case_config.yaml @@ -0,0 +1,5 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/dropout/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/dropout/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..bfd12215d --- /dev/null +++ b/operation/benchmarks/dropout/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Melements: 1 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/dropout/kunlunxin/R300p/env.sh b/operation/benchmarks/dropout/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/dropout/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/dropout/main.py b/operation/benchmarks/dropout/main.py new file mode 100644 index 000000000..ce6a71ba1 --- /dev/null +++ b/operation/benchmarks/dropout/main.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + m = case_config.Melements + a = torch.randn(m * 1024 * 1024, dtype=dtype[config.dataformat]).to(0) + f = torch.nn.Dropout(p=0.2) + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * m * 1024 * 1024 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = 
yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/dropout/metax/C550_64/case_config.yaml b/operation/benchmarks/dropout/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/dropout/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/dropout/metax/C550_64/env.sh b/operation/benchmarks/dropout/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/dropout/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/dropout/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/dropout/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..641f87058 --- /dev/null +++ b/operation/benchmarks/dropout/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.24TFLOPS | 0.24TFLOPS | 0.08% | 0.08% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3147.35us | 3208.19us | 317.73op/s | 311.7op/s | 1338018.75us | 3309.37us | +| nativetorch | 4517.37us | 4502.53us | 221.37op/s | 222.1op/s | 15703.21us | 4523.7us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 147.08W | / | 318.39W | 325.0W | 7.52W | 400W | +| flaggems监控结果 | 1599.0W | 1794.0W | 195.0W | / | 394.09W | 406.0W | 8.27W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.759% | 1.18% | 53.26°C | 23.952% | +| flaggems监控结果 | 0.687% | 1.192% | 58.76°C | 16.186% | diff --git a/operation/benchmarks/dropout/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/dropout/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..558b06506 --- /dev/null +++ b/operation/benchmarks/dropout/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.24TFLOPS | 0.24TFLOPS | 0.08% | 0.08% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3141.34us | 3203.07us | 318.34op/s | 312.2op/s | 524416.01us | 3281.54us | +| nativetorch | 4515.39us | 4515.84us | 221.46op/s | 221.44op/s | 15994.08us | 4539.99us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 147.08W | / | 329.5W | 335.0W | 5.44W | 400W | +| flaggems监控结果 | 1599.0W | 1794.0W | 195.0W | / | 397.94W | 405.0W | 4.08W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.719% | 1.152% | 54.11°C | 18.899% | +| flaggems监控结果 | 0.599% | 1.138% | 56.76°C | 16.186% | diff --git a/operation/benchmarks/dropout/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/dropout/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..94394b948 --- /dev/null +++ b/operation/benchmarks/dropout/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.87% | 0.86% | +| nativetorch | True | 0.14TFLOPS | 0.14TFLOPS | 0.7% | 0.7% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6343.16us | 6424.58us | 157.65op/s | 155.65op/s | 529269.29us | 6513.07us | +| nativetorch | 7824.75us | 7882.75us | 127.8op/s | 126.86op/s | 20842.84us | 7609.15us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1716.0W | 135.1W | / | 287.13W | 296.0W | 5.57W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 331.73W | 334.0W | 3.55W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.643% | 1.104% | 52.43°C | 34.056% | +| flaggems监控结果 | 0.667% | 1.158% | 52.68°C | 31.343% | diff --git a/operation/benchmarks/dropout/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/dropout/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/dropout/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/dropout/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/dropout/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- 
/dev/null +++ b/operation/benchmarks/dropout/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/dropout/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/dropout/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/dropout/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/eq/cambricon/MLU/case_config.yaml b/operation/benchmarks/eq/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/eq/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/eq/cambricon/MLU/env.sh b/operation/benchmarks/eq/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/eq/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/eq/case_config.yaml b/operation/benchmarks/eq/case_config.yaml index 922bb72de..7cfa24472 100644 --- a/operation/benchmarks/eq/case_config.yaml +++ b/operation/benchmarks/eq/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 10 ITERS: 1000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/bmm/nvidia/A100_40_SXM/README.md b/operation/benchmarks/eq/iluvatar/BI150/README.md similarity index 65% rename from operation/benchmarks/bmm/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/eq/iluvatar/BI150/README.md index 658d00e5e..f056738b1 100644 --- a/operation/benchmarks/bmm/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/eq/iluvatar/BI150/README.md @@ -1,56 +1,56 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 1.78E-04 | 235.09TFLOPS | 245.6TFLOPS | 75.35% | 78.72% | -| nativetorch | 1.78E-04 | 242.57TFLOPS | 250.64TFLOPS | 77.75% | 80.33% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 4.22E-06 | 1169.23us | 1119.23us | 855.26op/s | 893.47op/s | 1646797.64us | 1212.64us | -| nativetorch | 4.22E-06 | 1133.19us | 1096.7us | 882.47op/s | 911.82op/s | 2701.11us | 1257.28us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1872.0W | 1872.0W | 0.0W | / | 398.57W | 405.0W | 3.47W | 1872.0 | -| flaggems监控结果 | 1872.0W | 1872.0W | 0.0W | / | 398.94W | 416.0W | 8.71W | 1872.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.742% | 2.233% | 66.07°C | 1.759% | -| flaggems监控结果 | 0.597% | 2.275% | 66.19°C | 3.042% | +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID:982781081f5d62856064ae986e8927a31e96c235 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.12TFLOPS | 0.12TFLOPS | 0.24% | 0.24% | +| nativetorch | 0.00E+00 | 0.14TFLOPS | 0.14TFLOPS | 0.28% | 0.28% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 9257.5us | 9268.29us | 108.02op/s | 107.89op/s | 248942.92us | 9997.55us | +| nativetorch | 0.00E+00 | 7842.88us | 7862.79us | 127.5op/s | 127.18op/s | 8110.18us | 8075.02us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2080.5W | 2109.0W | 39.17W | / | 167.06W | 168.0W | 0.25W | 350W | +| flaggems监控结果 | 2059.6W | 2090.0W | 37.23W | / | 163.27W | 164.0W | 0.45W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 41.763% | 2.393% | 48.74°C | 16.364% | +| flaggems监控结果 | 45.66% | 2.394% | 47.82°C | 17.926% | \ No newline at end of file diff --git a/operation/benchmarks/eq/iluvatar/BI150/case_config.yaml b/operation/benchmarks/eq/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..62122b8a2 --- /dev/null +++ b/operation/benchmarks/eq/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 49.152 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/eq/iluvatar/BI150/env.sh 
b/operation/benchmarks/eq/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/eq/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/eq/iluvatar/BI150/requirements.txt b/operation/benchmarks/eq/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/eq/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/eq/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/eq/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/eq/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/eq/kunlunxin/R300p/env.sh b/operation/benchmarks/eq/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/eq/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/eq/main.py b/operation/benchmarks/eq/main.py index d82d7ed7c..16dad170a 100644 --- a/operation/benchmarks/eq/main.py +++ b/operation/benchmarks/eq/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + 
parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,35 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase Melements = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(Melements, dtype=dtype[config.dataformat]) - b = torch.randn(Melements, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.eq(a, b) - - a = a.to(0) - b = b.to(0) - r_device = torch.eq(a, b).cpu() - -# mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - mape = ((r_device != r_fp64).float().sum()/r_fp64.numel()).item() - mmape.append(mape) - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(Melements, 1024, 1024, dtype=dtype[config.dataformat]).to(0) b = torch.randn(Melements, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -81,11 +75,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.eq, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: x * 2 * Melements * 1024 * 1024 + op2flops = lambda x: x * Melements * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "eq", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, 
correctness, latency_nowarm, latency_warm) @@ -93,6 +87,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/eq/metax/C550_64/case_config.yaml b/operation/benchmarks/eq/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/eq/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/eq/metax/C550_64/env.sh b/operation/benchmarks/eq/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/eq/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/eq/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/eq/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..1f881a678 --- /dev/null +++ b/operation/benchmarks/eq/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3972.6us | 3989.5us | 251.72op/s | 250.66op/s | 2181690.51us | 4061.19us | +| nativetorch | 3815.22us | 3828.74us | 262.11op/s | 261.18op/s | 22478.2us | 3801.72us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 259.03W | 262.0W | 3.65W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 97.28W | / | 299.77W | 304.0W | 5.35W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.742% | 2.324% | 47.99°C | 17.394% | +| flaggems监控结果 | 0.749% | 2.321% | 51.59°C | 17.207% | diff --git a/operation/benchmarks/eq/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/eq/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..b39f37f45 --- /dev/null +++ b/operation/benchmarks/eq/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3964.2us | 3981.31us | 252.26op/s | 251.17op/s | 898462.19us | 4053.63us | +| nativetorch | 3810.06us | 3824.64us | 262.46op/s | 261.46op/s | 840607.53us | 3786.4us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 269.47W | 273.0W | 3.81W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 97.28W | / | 298.08W | 303.0W | 5.65W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.701% | 2.326% | 48.38°C | 17.394% | +| flaggems监控结果 | 0.675% | 2.324% | 50.9°C | 17.207% | diff --git a/operation/benchmarks/eq/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/eq/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..bbe70e663 --- /dev/null +++ b/operation/benchmarks/eq/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 
+ +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.8% | +| nativetorch | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.81% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6821.87us | 6840.32us | 146.59op/s | 146.19op/s | 1710966.42us | 6979.48us | +| nativetorch | 6778.43us | 6796.29us | 147.53op/s | 147.14op/s | 846129.64us | 6796.64us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1638.0W | 67.55W | / | 260.78W | 263.0W | 2.29W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 95.53W | / | 285.69W | 288.0W | 2.42W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.858% | 2.294% | 48.41°C | 27.499% | +| flaggems监控结果 | 0.745% | 2.295% | 49.39°C | 27.312% | diff --git a/operation/benchmarks/eq/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/eq/nvidia/A100_40_SXM/case_config.yaml index f3489fba2..bc4b04b42 100644 --- a/operation/benchmarks/eq/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/eq/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 312 diff --git a/operation/benchmarks/exp/cambricon/MLU/case_config.yaml b/operation/benchmarks/exp/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ 
b/operation/benchmarks/exp/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/exp/cambricon/MLU/env.sh b/operation/benchmarks/exp/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/exp/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/exp/case_config.yaml b/operation/benchmarks/exp/case_config.yaml new file mode 100644 index 000000000..acc0f44fb --- /dev/null +++ b/operation/benchmarks/exp/case_config.yaml @@ -0,0 +1,5 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/exp/iluvatar/BI150/README.md b/operation/benchmarks/exp/iluvatar/BI150/README.md new file mode 100644 index 000000000..05893c7df --- /dev/null +++ b/operation/benchmarks/exp/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 4.17E-01 | 0.14TFLOPS | 0.14TFLOPS | 0.59% | 0.59% | +| nativetorch | 5.46E-01 | 0.15TFLOPS | 0.15TFLOPS | 0.59% | 0.59% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 2.25E-02 | 7412.41us | 7417.78us | 134.91op/s | 134.81op/s | 337903.93us | 7847.42us 
| +| nativetorch | 2.13E-02 | 7367.84us | 7393.22us | 135.72op/s | 135.26op/s | 7687.31us | 7652.91us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2066.25W | 2090.0W | 41.14W | / | 176.99W | 177.0W | 0.12W | 350W | +| flaggems监控结果 | 2071.0W | 2090.0W | 32.91W | / | 172.01W | 173.0W | 0.12W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 43.451% | 2.388% | 47.96°C | 19.489% | +| flaggems监控结果 | 41.585% | 2.391% | 47.07°C | 19.489% | diff --git a/operation/benchmarks/exp/iluvatar/BI150/case_config.yaml b/operation/benchmarks/exp/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/exp/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/exp/iluvatar/BI150/env.sh b/operation/benchmarks/exp/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/exp/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/exp/iluvatar/BI150/requirements.txt b/operation/benchmarks/exp/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/exp/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git 
a/operation/benchmarks/exp/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/exp/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/exp/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/exp/kunlunxin/R300p/env.sh b/operation/benchmarks/exp/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/exp/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/exp/main.py b/operation/benchmarks/exp/main.py new file mode 100644 index 000000000..e7d753a02 --- /dev/null +++ b/operation/benchmarks/exp/main.py @@ -0,0 +1,102 @@ + # Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + 
args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + Melements = case_config.Melements + + + a = torch.randn(Melements * 1024 * 1024, dtype=dtype[config.dataformat]).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.exp, (a, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * Melements * 1024 * 1024 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) \ No newline at end of file diff --git a/operation/benchmarks/exp/metax/C550_64/case_config.yaml b/operation/benchmarks/exp/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/exp/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/exp/metax/C550_64/env.sh b/operation/benchmarks/exp/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/exp/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 
\ No newline at end of file diff --git a/operation/benchmarks/exp/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/exp/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..dfc2d6022 --- /dev/null +++ b/operation/benchmarks/exp/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3186.29us | 3190.78us | 313.84op/s | 313.4op/s | 860088.85us | 3266.39us | +| nativetorch | 3109.29us | 3115.01us | 321.62op/s | 321.03op/s | 23140.04us | 3129.42us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 306.0W | 310.0W | 5.24W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 271.88W | 276.0W | 4.42W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.856% | 2.309% | 51.45°C | 17.394% | +| 
flaggems监控结果 | 0.847% | 2.307% | 48.72°C | 17.207% | diff --git a/operation/benchmarks/exp/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/exp/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..5b2a4119e --- /dev/null +++ b/operation/benchmarks/exp/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3184.43us | 3188.74us | 314.03op/s | 313.6op/s | 1436720.37us | 3270.22us | +| nativetorch | 3108.24us | 3113.98us | 321.73op/s | 321.13op/s | 24362.78us | 3124.14us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 317.84W | 322.0W | 5.39W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 282.34W | 288.0W | 4.16W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.908% | 2.314% | 
51.48°C | 17.394% | +| flaggems监控结果 | 0.857% | 2.313% | 49.05°C | 17.207% | diff --git a/operation/benchmarks/exp/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/exp/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..4464be522 --- /dev/null +++ b/operation/benchmarks/exp/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6258.74us | 6269.95us | 159.78op/s | 159.49op/s | 2003050.95us | 6365.73us | +| nativetorch | 6191.55us | 6193.15us | 161.51op/s | 161.47op/s | 29134.02us | 6241.25us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 97.28W | / | 280.1W | 283.0W | 2.83W | 400W | +| flaggems监控结果 | 1586.0W | 1638.0W | 73.54W | / | 265.86W | 268.0W | 3.34W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 
1.35% | 2.292% | 49.81°C | 42.656% | +| flaggems监控结果 | 0.81% | 2.292% | 49.37°C | 32.364% | diff --git a/operation/benchmarks/exp/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/exp/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/exp/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/exp/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/exp/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/exp/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/exp/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/exp/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/exp/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/ge/cambricon/MLU/case_config.yaml b/operation/benchmarks/ge/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/ge/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/ge/cambricon/MLU/env.sh b/operation/benchmarks/ge/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/ge/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/ge/case_config.yaml b/operation/benchmarks/ge/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/ge/case_config.yaml +++ b/operation/benchmarks/ge/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/ge/iluvatar/BI150/README.md b/operation/benchmarks/ge/iluvatar/BI150/README.md new file mode 100644 index 000000000..bec4b0451 --- 
/dev/null +++ b/operation/benchmarks/ge/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.11TFLOPS | 0.11TFLOPS | 0.47% | 0.47% | +| nativetorch | 0.00E+00 | 0.14TFLOPS | 0.14TFLOPS | 0.56% | 0.56% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 9367.86us | 9388.52us | 106.75op/s | 106.51op/s | 252885.14us | 9842.44us | +| nativetorch | 0.00E+00 | 7834.02us | 7864.16us | 127.65op/s | 127.16op/s | 8152.24us | 8132.34us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2090.0W | 2109.0W | 32.91W | / | 167.96W | 168.0W | 0.19W | 350W | +| flaggems监控结果 | 2090.0W | 2109.0W | 24.03W | / | 163.97W | 164.0W | 0.18W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 39.204% | 2.582% | 49.73°C | 16.364% | +| flaggems监控结果 | 39.223% | 2.587% | 48.87°C | 17.926% | \ No newline at end of file diff --git a/operation/benchmarks/ge/iluvatar/BI150/case_config.yaml 
b/operation/benchmarks/ge/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/ge/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/ge/iluvatar/BI150/env.sh b/operation/benchmarks/ge/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/ge/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/ge/iluvatar/BI150/requirements.txt b/operation/benchmarks/ge/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/ge/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/ge/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/ge/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/ge/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/ge/kunlunxin/R300p/env.sh b/operation/benchmarks/ge/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/ge/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/ge/main.py b/operation/benchmarks/ge/main.py index 371b6048d..54354ad69 100644 --- a/operation/benchmarks/ge/main.py +++ 
b/operation/benchmarks/ge/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,36 +53,22 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - b = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.ge(a_fp64, b_fp64) - - a = a.to(0) - b = b.to(0) - r_device = torch.ge(a, b).cpu() - mape = ((r_device != r_fp64).float().sum()/r_fp64.numel()).item() - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -82,11 +76,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.ge, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: x 
* 2 * m * 1024 * 1024 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "ge", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -94,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/ge/metax/C550_64/case_config.yaml b/operation/benchmarks/ge/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/ge/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/ge/metax/C550_64/env.sh b/operation/benchmarks/ge/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/ge/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/ge/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/ge/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..56347074e --- /dev/null +++ b/operation/benchmarks/ge/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3972.96us | 3991.55us | 251.7op/s | 250.53op/s | 989515.87us | 4066.99us | +| nativetorch | 3814.06us | 3828.74us | 262.19op/s | 261.18op/s | 18294.3us | 3832.56us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 110.31W | / | 261.82W | 265.0W | 3.3W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 303.8W | 308.0W | 5.48W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.74% | 2.522% | 48.15°C | 17.394% | +| flaggems监控结果 | 1.05% | 2.521% | 51.77°C | 17.207% | diff --git a/operation/benchmarks/ge/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/ge/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..3c112e986 --- /dev/null +++ b/operation/benchmarks/ge/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3962.31us | 3981.31us | 252.38op/s | 251.17op/s | 879315.56us | 4060.77us | +| nativetorch | 3809.17us | 3823.62us | 262.52op/s | 261.53op/s | 18512.46us | 3827.93us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 270.97W | 279.0W | 3.62W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 97.28W | / | 301.27W | 305.0W | 5.45W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.749% | 2.524% | 48.89°C | 17.394% | +| flaggems监控结果 | 0.744% | 2.527% | 50.79°C | 17.207% | diff --git a/operation/benchmarks/ge/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/ge/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..07fba0a25 --- /dev/null +++ b/operation/benchmarks/ge/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务器间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.8% | +| nativetorch | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.81% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6822.21us | 6840.32us | 146.58op/s | 146.19op/s | 1484940.19us | 6922.37us | +| nativetorch | 6778.9us | 6796.29us | 147.52op/s | 147.14op/s | 21901.5us | 6805.13us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1638.0W | 67.55W | / | 263.16W | 268.0W | 3.17W | 400W | +| flaggems监控结果 | 1657.5W | 1716.0W | 101.32W | / | 287.93W | 290.0W | 2.69W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.75% | 2.495% | 48.17°C | 27.499% | +| flaggems监控结果 | 0.699% | 2.491% | 49.41°C | 30.025% | diff --git a/operation/benchmarks/ge/nvidia/A100_40_SXM/README.md b/operation/benchmarks/ge/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 1021f43ae..000000000 --- a/operation/benchmarks/ge/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 
内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.63% | 1.62% | -| nativetorch | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.62% | 1.62% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6770.17us | 6787.07us | 147.71op/s | 147.34op/s | 303095.12us | 6845.48us | -| nativetorch | 0.00E+00 | 6778.54us | 6796.29us | 147.52op/s | 147.14op/s | 9853.79us | 7919.67us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1664.0W | 1716.0W | 36.77W | / | 261.93W | 266.0W | 4.09W | 1664.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 300.87W | 304.0W | 3.66W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.8% | 1.405% | 48.4°C | 26.483% | -| flaggems监控结果 | 0.831% | 1.405% | 51.06°C | 29.009% | diff --git a/operation/benchmarks/ge/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/ge/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/ge/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/ge/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/gelu/cambricon/MLU/case_config.yaml b/operation/benchmarks/gelu/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/gelu/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/gelu/cambricon/MLU/env.sh b/operation/benchmarks/gelu/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/gelu/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/gelu/case_config.yaml b/operation/benchmarks/gelu/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/gelu/case_config.yaml +++ b/operation/benchmarks/gelu/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/gelu/iluvatar/BI150/README.md b/operation/benchmarks/gelu/iluvatar/BI150/README.md new file mode 100644 index 000000000..96a545019 --- /dev/null +++ b/operation/benchmarks/gelu/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 1.98E-07 | 0.65TFLOPS | 0.65TFLOPS | 2.65% | 2.65% | +| nativetorch | 1.98E-07 | 0.66TFLOPS | 0.65TFLOPS | 2.67% | 2.66% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | 
-------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 5.80E-07 | 7414.56us | 7417.78us | 134.87op/s | 134.81op/s | 351581.68us | 7976.16us | +| nativetorch | 5.80E-07 | 7364.66us | 7392.9us | 135.78op/s | 135.26op/s | 7674.4us | 7758.33us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2099.5W | 2128.0W | 49.36W | / | 182.85W | 184.0W | 4.6W | 350W | +| flaggems监控结果 | 2099.5W | 2128.0W | 49.36W | / | 189.89W | 190.0W | 0.35W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 53.276% | 2.388% | 52.32°C | 19.489% | +| flaggems监控结果 | 51.369% | 2.391% | 53.85°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/gelu/iluvatar/BI150/case_config.yaml b/operation/benchmarks/gelu/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/gelu/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/gelu/iluvatar/BI150/env.sh b/operation/benchmarks/gelu/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/gelu/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/gelu/iluvatar/BI150/requirements.txt 
b/operation/benchmarks/gelu/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/gelu/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/gelu/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/gelu/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..e40de9acf --- /dev/null +++ b/operation/benchmarks/gelu/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [1024, 1024] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/gelu/kunlunxin/R300p/env.sh b/operation/benchmarks/gelu/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/gelu/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/gelu/main.py b/operation/benchmarks/gelu/main.py index b7e8695fe..e7dd95057 100644 --- a/operation/benchmarks/gelu/main.py +++ b/operation/benchmarks/gelu/main.py @@ -4,12 +4,13 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess +import math sys.path.append("..") from drivers.utils import * @@ -23,6 +24,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,41 +54,39 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + 
"INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation") m = case_config.Melements - f = torch.nn.GELU() - dtype = {"FP32": torch.float32} + # default shape: (M, 1024, 1024) + shape = (m, 1024, 1024) - mmape = [] + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = f(a_fp64) # 修改为gelu - - a = a.to(0) - r_device = f(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) - - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') + f = torch.nn.GELU() latency_nowarm, latency_warm, cputime, kerneltime = do_test( f, (a, ), host_device_sync, config, case_config) # 调整为torch.sub - op2flops = lambda x: x * 9 * m * 1024 * 1024 # 根据减法的实际FLOPs调整 + op2flops = lambda x: x * 9 * math.prod(shape) perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "gelu", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -87,6 +94,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/gelu/metax/C550_64/case_config.yaml 
b/operation/benchmarks/gelu/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/gelu/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/gelu/metax/C550_64/env.sh b/operation/benchmarks/gelu/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/gelu/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/gelu/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/gelu/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..78973a234 --- /dev/null +++ b/operation/benchmarks/gelu/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 2.7TFLOPS | 2.71TFLOPS | 0.87% | 0.87% | +| nativetorch | True | 2.96TFLOPS | 2.95TFLOPS | 0.95% | 0.95% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3576.63us | 3561.47us | 279.59op/s | 280.78op/s | 1426179.79us | 3652.27us | +| nativetorch | 3269.93us | 3274.75us | 305.82op/s | 305.37op/s | 18146.86us | 3293.32us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1794.0W | 156.0W | / | 382.0W | 389.0W | 8.34W | 400W | +| flaggems监控结果 | 1638.0W | 1794.0W | 156.0W | / | 397.14W | 405.0W | 4.96W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.837% | 2.306% | 57.74°C | 17.394% | +| flaggems监控结果 | 0.966% | 2.307% | 62.89°C | 17.212% | diff --git a/operation/benchmarks/gelu/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/gelu/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..d48805330 --- /dev/null +++ b/operation/benchmarks/gelu/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 2.71TFLOPS | 2.71TFLOPS | 0.87% | 0.87% | +| nativetorch | True | 2.95TFLOPS | 2.95TFLOPS | 0.95% | 0.94% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3571.19us | 3559.42us | 280.02op/s | 280.94op/s | 1002138.84us | 3623.78us | +| nativetorch | 3277.56us | 3280.9us | 305.1op/s | 304.79op/s | 28947.12us | 3313.13us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1794.0W | 156.0W | / | 395.03W | 402.0W | 8.28W | 400W | +| flaggems监控结果 | 1677.0W | 1872.0W | 195.0W | / | 396.25W | 404.0W | 5.96W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.775% | 2.31% | 62.15°C | 17.394% | +| flaggems监控结果 | 0.857% | 2.309% | 58.45°C | 17.212% | diff --git a/operation/benchmarks/gelu/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/gelu/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..95cd9068c --- /dev/null +++ b/operation/benchmarks/gelu/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 1.55TFLOPS | 1.54TFLOPS | 7.94% | 7.92% | +| nativetorch | True | 1.57TFLOPS | 1.57TFLOPS | 8.08% | 8.07% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6237.56us | 6255.62us | 160.32op/s | 159.86op/s | 1716276.85us | 6340.94us | +| nativetorch | 6137.1us | 6140.93us | 162.94op/s | 162.84op/s | 22741.53us | 6172.7us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 147.08W | / | 325.82W | 330.0W | 4.17W | 400W | +| flaggems监控结果 | 1690.0W | 1794.0W | 147.08W | / | 355.78W | 359.0W | 3.95W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.799% | 2.29% | 52.09°C | 32.551% | +| flaggems监控结果 | 0.73% | 2.291% | 53.85°C | 42.656% | diff --git a/operation/benchmarks/gelu/nvidia/A100_40_SXM/README.md b/operation/benchmarks/gelu/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 09066f016..000000000 --- a/operation/benchmarks/gelu/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 
-* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:9168f2d031ecc1b31a9f658fb66dd6735b7306b3 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 1.80E-07 | 1.57TFLOPS | 1.57TFLOPS | 8.07% | 8.06% | -| nativetorch | 2.15E-07 | 1.57TFLOPS | 1.57TFLOPS | 8.07% | 8.07% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 3.87E-07 | 6139.86us | 6145.02us | 162.87op/s | 162.73op/s | 314473.94us | 6205.19us | -| nativetorch | 4.20E-07 | 6138.4us | 6141.95us | 162.91op/s | 162.81op/s | 10541.9us | 6156.06us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1716.0W | 1716.0W | 0.0W | / | 325.77W | 329.0W | 5.39W | 1716.0 | -| flaggems监控结果 | 1794.0W | 1794.0W | 0.0W | / | 373.03W | 379.0W | 4.64W | 1794.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.798% | 1.396% | 52.73°C | 41.64% | -| flaggems监控结果 | 0.768% | 1.396% | 54.92°C | 31.352% | diff --git a/operation/benchmarks/gelu/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/gelu/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/gelu/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/gelu/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/group_norm/cambricon/MLU/case_config.yaml b/operation/benchmarks/group_norm/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/group_norm/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/group_norm/cambricon/MLU/env.sh b/operation/benchmarks/group_norm/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/group_norm/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/group_norm/case_config.yaml b/operation/benchmarks/group_norm/case_config.yaml new file mode 100644 index 000000000..a4396df36 --- /dev/null +++ b/operation/benchmarks/group_norm/case_config.yaml @@ -0,0 +1,7 @@ +bs: 20 +channel: 6 +hiddensize: 65536 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/group_norm/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/group_norm/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..0c2890f86 --- /dev/null +++ b/operation/benchmarks/group_norm/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,5 @@ +bs: 1 +channel: 3 +hiddensize: 4 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/group_norm/kunlunxin/R300p/env.sh b/operation/benchmarks/group_norm/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/group_norm/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/group_norm/main.py b/operation/benchmarks/group_norm/main.py new file mode 100644 index 000000000..1210777f9 --- /dev/null +++ b/operation/benchmarks/group_norm/main.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + bs = case_config.bs + channel = case_config.channel + hiddensize = case_config.hiddensize + a = torch.randn(bs, channel, hiddensize, dtype=dtype[config.dataformat], requires_grad=True).to(0) + f = torch.nn.GroupNorm(channel // 2, channel, dtype=dtype[config.dataformat]).to(0) + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * bs * channel * hiddensize * 9 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, 
correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/group_norm/metax/C550_64/case_config.yaml b/operation/benchmarks/group_norm/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..74543ac45 --- /dev/null +++ b/operation/benchmarks/group_norm/metax/C550_64/case_config.yaml @@ -0,0 +1,4 @@ +bs: 16 +channel: 512 +hiddensize: 1024 +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/group_norm/metax/C550_64/env.sh b/operation/benchmarks/group_norm/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/group_norm/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/group_norm/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..59a5bf870 --- /dev/null +++ b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.26TFLOPS | 0.09% | 0.08% | +| nativetorch | True | 0.41TFLOPS | 0.38TFLOPS | 0.13% | 0.12% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 263.82us | 268.29us | 3790.48op/s | 3727.34op/s | 6721148.13us | 429.98us | +| nativetorch | 173.51us | 188.42us | 5763.33op/s | 5307.4op/s | 14461.22us | 239.68us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 151.0W | 152.0W | 1.0W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 162.33W | 169.0W | 4.71W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.481% | 1.213% | 34.1°C | 2.622% | +| flaggems监控结果 | 1.755% | 1.227% | 34.76°C | 4.389% | diff --git a/operation/benchmarks/group_norm/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..cac707e94 --- /dev/null +++ b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.31TFLOPS | 0.3TFLOPS | 0.1% | 0.1% | +| nativetorch | True | 0.41TFLOPS | 0.37TFLOPS | 0.13% | 0.12% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 229.14us | 236.54us | 4364.19op/s | 4227.54op/s | 5382161.34us | 367.07us | +| nativetorch | 173.57us | 189.44us | 5761.23op/s | 5278.72op/s | 13834.66us | 237.72us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 151.5W | 152.0W | 0.5W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 164.5W | 165.0W | 0.5W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.979% | 1.213% | 33.86°C | 2.622% | +| flaggems监控结果 | 1.105% | 1.213% | 34.1°C | 3.797% | diff --git a/operation/benchmarks/group_norm/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..ed37696fb --- /dev/null +++ b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.2TFLOPS | 0.19TFLOPS | 1.0% | 0.99% | +| nativetorch | True | 0.34TFLOPS | 0.33TFLOPS | 1.77% | 1.7% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 361.43us | 367.62us | 2766.76op/s | 2720.23op/s | 7182146.77us | 537.05us | +| nativetorch | 205.34us | 212.99us | 4869.98op/s | 4695.01op/s | 30508.51us | 271.59us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 171.0W | 172.0W | 1.0W | 400W | +| flaggems监控结果 | 1404.0W | 1404.0W | 0.0W | / | 166.0W | 168.0W | 1.87W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.695% | 1.215% | 33.86°C | 2.622% | +| flaggems监控结果 | 0.584% | 1.216% | 34.86°C | 5.449% | diff --git a/operation/benchmarks/group_norm/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/group_norm/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- 
/dev/null +++ b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/group_norm/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/group_norm/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/gt/cambricon/MLU/case_config.yaml b/operation/benchmarks/gt/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/gt/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/gt/cambricon/MLU/env.sh b/operation/benchmarks/gt/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/gt/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/gt/case_config.yaml b/operation/benchmarks/gt/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/gt/case_config.yaml +++ b/operation/benchmarks/gt/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/gt/iluvatar/BI150/README.md b/operation/benchmarks/gt/iluvatar/BI150/README.md new file mode 100644 index 000000000..09944a1cd --- /dev/null +++ b/operation/benchmarks/gt/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | 
FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.12TFLOPS | 0.12TFLOPS | 0.47% | 0.47% | +| nativetorch | 0.00E+00 | 0.14TFLOPS | 0.14TFLOPS | 0.56% | 0.56% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 9243.81us | 9262.75us | 108.18op/s | 107.96op/s | 247835.77us | 9726.58us | +| nativetorch | 0.00E+00 | 7831.4us | 7861.4us | 127.69op/s | 127.2op/s | 8118.6us | 8125.91us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2090.0W | 2109.0W | 32.91W | / | 167.92W | 168.0W | 0.27W | 350W | +| flaggems监控结果 | 2074.8W | 2109.0W | 42.32W | / | 163.83W | 165.0W | 1.88W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 46.163% | 2.391% | 49.37°C | 16.364% | +| flaggems监控结果 | 44.343% | 2.394% | 48.04°C | 17.926% | diff --git a/operation/benchmarks/gt/iluvatar/BI150/case_config.yaml b/operation/benchmarks/gt/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/gt/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/gt/iluvatar/BI150/env.sh b/operation/benchmarks/gt/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/gt/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export 
PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/gt/iluvatar/BI150/requirements.txt b/operation/benchmarks/gt/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/gt/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/gt/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/gt/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/gt/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/gt/kunlunxin/R300p/env.sh b/operation/benchmarks/gt/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/gt/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/gt/main.py b/operation/benchmarks/gt/main.py index 0a675c244..00ac9fc6d 100644 --- a/operation/benchmarks/gt/main.py +++ b/operation/benchmarks/gt/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current 
dataformat") parser.add_argument("--dataformat", type=str, @@ -45,36 +53,22 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - b = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.gt(a_fp64, b_fp64) - - a = a.to(0) - b = b.to(0) - r_device = torch.gt(a, b).cpu() - mape = ((r_device != r_fp64).float().sum()/r_fp64.numel()).item() - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -82,11 +76,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.gt, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: x * 2 * m * 1024 * 1024 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "gt", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -94,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: 
case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/gt/metax/C550_64/case_config.yaml b/operation/benchmarks/gt/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/gt/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/gt/metax/C550_64/env.sh b/operation/benchmarks/gt/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/gt/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/gt/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/gt/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..cfd3f435d --- /dev/null +++ b/operation/benchmarks/gt/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3971.78us | 3991.55us | 251.78op/s | 250.53op/s | 2015949.6us | 4068.82us | +| nativetorch | 3814.01us | 3828.74us | 262.19op/s | 261.18op/s | 18569.3us | 3857.35us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 263.28W | 267.0W | 4.15W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 303.62W | 308.0W | 6.04W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.742% | 2.323% | 48.18°C | 17.394% | +| flaggems监控结果 | 0.773% | 2.323% | 51.62°C | 17.207% | diff --git a/operation/benchmarks/gt/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/gt/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..f9b361c1e --- /dev/null +++ b/operation/benchmarks/gt/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3962.4us | 3981.31us | 252.37op/s | 251.17op/s | 1401777.92us | 4061.56us | +| nativetorch | 3808.89us | 3823.62us | 262.54op/s | 261.53op/s | 1125523.32us | 3826.0us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 275.0W | 280.0W | 5.19W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 301.52W | 305.0W | 4.85W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.455% | 2.425% | 49.31°C | 17.394% | +| flaggems监控结果 | 0.675% | 2.42% | 51.04°C | 17.207% | diff --git a/operation/benchmarks/gt/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/gt/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..2c99c3378 --- /dev/null +++ b/operation/benchmarks/gt/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.8% | +| nativetorch | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.81% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6822.09us | 6840.32us | 146.58op/s | 146.19op/s | 1943887.52us | 7026.99us | +| nativetorch | 6778.93us | 6797.31us | 147.52op/s | 147.12op/s | 24637.37us | 6825.28us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1618.5W | 1716.0W | 85.0W | / | 264.49W | 267.0W | 2.43W | 400W | +| flaggems监控结果 | 1657.5W | 1716.0W | 101.32W | / | 289.0W | 292.0W | 2.69W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.739% | 2.402% | 48.09°C | 27.499% | +| flaggems监控结果 | 0.742% | 2.494% | 49.39°C | 27.312% | diff --git a/operation/benchmarks/gt/nvidia/A100_40_SXM/README.md b/operation/benchmarks/gt/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 7eb1920b3..000000000 --- a/operation/benchmarks/gt/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 
内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.63% | 1.62% | -| nativetorch | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.62% | 1.62% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6770.15us | 6787.07us | 147.71op/s | 147.34op/s | 265004.22us | 7077.09us | -| nativetorch | 0.00E+00 | 6778.75us | 6796.29us | 147.52op/s | 147.14op/s | 8453.5us | 6810.0us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1690.0W | 1716.0W | 36.77W | / | 263.69W | 266.0W | 3.48W | 1690.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 299.88W | 303.0W | 3.43W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.804% | 1.402% | 48.68°C | 26.483% | -| flaggems监控结果 | 0.767% | 1.402% | 50.86°C | 26.295% | diff --git a/operation/benchmarks/gt/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/gt/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/gt/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/gt/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/isinf/cambricon/MLU/case_config.yaml b/operation/benchmarks/isinf/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/isinf/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/isinf/cambricon/MLU/env.sh b/operation/benchmarks/isinf/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/isinf/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/isinf/case_config.yaml b/operation/benchmarks/isinf/case_config.yaml new file mode 100644 index 000000000..e27027ead --- /dev/null +++ b/operation/benchmarks/isinf/case_config.yaml @@ -0,0 +1,5 @@ +M: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/isinf/iluvatar/BI150/README.md b/operation/benchmarks/isinf/iluvatar/BI150/README.md new file mode 100644 index 000000000..134cf4dea --- /dev/null +++ b/operation/benchmarks/isinf/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.1TFLOPS | 0.1TFLOPS | 0.43% | 0.42% | +| nativetorch | 0.00E+00 | 0.05TFLOPS | 0.05TFLOPS | 0.18% | 0.18% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | 
-------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 5118.75us | 5142.98us | 195.36op/s | 194.44op/s | 321684.18us | 5600.41us | +| nativetorch | 0.00E+00 | 11853.5us | 11859.15us | 84.36op/s | 84.32op/s | 12240.24us | 12208.26us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2109.0W | 2128.0W | 20.81W | / | 164.0W | 164.0W | 0.0W | 350W | +| flaggems监控结果 | 2093.8W | 2109.0W | 30.4W | / | 164.63W | 165.0W | 0.48W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 44.675% | 2.393% | 48.83°C | 31.989% | +| flaggems监控结果 | 47.973% | 2.399% | 48.44°C | 19.489% | diff --git a/operation/benchmarks/isinf/iluvatar/BI150/case_config.yaml b/operation/benchmarks/isinf/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..b98bc798e --- /dev/null +++ b/operation/benchmarks/isinf/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +M: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/isinf/iluvatar/BI150/env.sh b/operation/benchmarks/isinf/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/isinf/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/isinf/iluvatar/BI150/requirements.txt b/operation/benchmarks/isinf/iluvatar/BI150/requirements.txt new file mode 
100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/isinf/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/isinf/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/isinf/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/isinf/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/isinf/kunlunxin/R300p/env.sh b/operation/benchmarks/isinf/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/isinf/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/isinf/main.py b/operation/benchmarks/isinf/main.py new file mode 100644 index 000000000..f5677a2b5 --- /dev/null +++ b/operation/benchmarks/isinf/main.py @@ -0,0 +1,102 @@ + # Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + Melements = case_config.M + + + a = torch.randn(Melements * 1024 * 1024, dtype=dtype[config.dataformat]).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.isinf, (a, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * Melements * 1024 * 1024 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config 
= yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) \ No newline at end of file diff --git a/operation/benchmarks/isinf/metax/C550_64/case_config.yaml b/operation/benchmarks/isinf/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/isinf/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/isinf/metax/C550_64/env.sh b/operation/benchmarks/isinf/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/isinf/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/isinf/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/isinf/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..7656bc6a7 --- /dev/null +++ b/operation/benchmarks/isinf/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.44TFLOPS | 0.44TFLOPS | 0.14% | 0.14% | +| nativetorch | True | 0.2TFLOPS | 0.2TFLOPS | 0.06% | 0.06% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2437.89us | 2449.41us | 410.19op/s | 408.26op/s | 904535.18us | 2516.74us | +| nativetorch | 5454.66us | 5466.11us | 183.33op/s | 182.95op/s | 28045.76us | 5477.05us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 255.96W | 258.0W | 2.67W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 248.96W | 252.0W | 2.9W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.762% | 2.301% | 48.34°C | 17.394% | +| flaggems监控结果 | 0.781% | 2.312% | 45.72°C | 12.16% | diff --git a/operation/benchmarks/isinf/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/isinf/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..0a9529f5b --- /dev/null +++ b/operation/benchmarks/isinf/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.44TFLOPS | 0.44TFLOPS | 0.14% | 0.14% | +| nativetorch | True | 0.2TFLOPS | 0.2TFLOPS | 0.06% | 0.06% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2439.45us | 2451.46us | 409.93op/s | 407.92op/s | 1334024.76us | 2531.36us | +| nativetorch | 5442.92us | 5453.82us | 183.73op/s | 183.36op/s | 73553.12us | 5544.81us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 97.28W | / | 268.91W | 276.0W | 7.65W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 254.42W | 259.0W | 3.99W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.202% | 2.305% | 49.92°C | 17.394% | +| flaggems监控结果 | 0.755% | 2.319% | 45.89°C | 12.16% | diff --git a/operation/benchmarks/isinf/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/isinf/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..66366ca95 --- /dev/null +++ b/operation/benchmarks/isinf/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.28TFLOPS | 0.27TFLOPS | 1.42% | 1.41% | +| nativetorch | True | 0.11TFLOPS | 0.11TFLOPS | 0.55% | 0.55% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3887.72us | 3906.56us | 257.22op/s | 255.98op/s | 929892.76us | 3970.6us | +| nativetorch | 10044.82us | 10056.7us | 99.55op/s | 99.44op/s | 37919.1us | 10100.43us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1606.8W | 1638.0W | 62.4W | / | 258.32W | 262.0W | 2.45W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 254.87W | 259.0W | 3.57W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.162% | 2.288% | 49.38°C | 27.499% | +| flaggems监控结果 | 0.759% | 2.288% | 47.48°C | 17.212% | diff --git a/operation/benchmarks/isinf/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/isinf/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/isinf/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/isinf/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/isinf/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ 
b/operation/benchmarks/isinf/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/isinf/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/isinf/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/isinf/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/isnan/cambricon/MLU/case_config.yaml b/operation/benchmarks/isnan/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/isnan/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/isnan/cambricon/MLU/env.sh b/operation/benchmarks/isnan/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/isnan/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/isnan/case_config.yaml b/operation/benchmarks/isnan/case_config.yaml new file mode 100644 index 000000000..e27027ead --- /dev/null +++ b/operation/benchmarks/isnan/case_config.yaml @@ -0,0 +1,5 @@ +M: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/isnan/iluvatar/BI150/README.md b/operation/benchmarks/isnan/iluvatar/BI150/README.md new file mode 100644 index 000000000..31515aae3 --- /dev/null +++ b/operation/benchmarks/isnan/iluvatar/BI150/README.md @@ -0,0 +1,55 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel 
clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.11TFLOPS | 0.1TFLOPS | 0.43% | 0.43% | +| nativetorch | 0.00E+00 | 0.12TFLOPS | 0.12TFLOPS | 0.48% | 0.48% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 5074.71us | 5107.24us | 197.06op/s | 195.8op/s | 334015.04us | 5484.97us | +| nativetorch | 0.00E+00 | 4530.33us | 4555.68us | 220.73op/s | 219.51op/s | 4846.69us | 4868.31us | + + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2104.25W | 2128.0W | 41.14W | / | 170.58W | 171.0W | 3.57W | 350W | +| flaggems监控结果 | 2112.8W | 2128.0W | 30.4W | / | 164.97W | 165.0W | 0.17W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 39.824% | 2.589% | 49.78°C | 19.489% | +| flaggems监控结果 | 40.485% | 2.592% | 48.84°C | 19.489% | diff --git a/operation/benchmarks/isnan/iluvatar/BI150/case_config.yaml b/operation/benchmarks/isnan/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..b98bc798e --- /dev/null +++ b/operation/benchmarks/isnan/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +M: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/isnan/iluvatar/BI150/env.sh b/operation/benchmarks/isnan/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/isnan/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export 
PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/isnan/iluvatar/BI150/requirements.txt b/operation/benchmarks/isnan/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/isnan/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/isnan/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/isnan/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/isnan/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/isnan/kunlunxin/R300p/env.sh b/operation/benchmarks/isnan/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/isnan/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/isnan/main.py b/operation/benchmarks/isnan/main.py new file mode 100644 index 000000000..a75f97716 --- /dev/null +++ b/operation/benchmarks/isnan/main.py @@ -0,0 +1,102 @@ + # Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + Melements = case_config.M + + + a = torch.randn(Melements * 1024 * 1024, dtype=dtype[config.dataformat]).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.isnan, (a, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * Melements * 1024 * 1024 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config 
= yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) \ No newline at end of file diff --git a/operation/benchmarks/isnan/metax/C550_64/case_config.yaml b/operation/benchmarks/isnan/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/isnan/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/isnan/metax/C550_64/env.sh b/operation/benchmarks/isnan/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/isnan/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/isnan/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/isnan/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..1314ada6a --- /dev/null +++ b/operation/benchmarks/isnan/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.44TFLOPS | 0.44TFLOPS | 0.14% | 0.14% | +| nativetorch | True | 0.45TFLOPS | 0.45TFLOPS | 0.15% | 0.14% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2437.82us | 2449.41us | 410.2op/s | 408.26op/s | 3170632.5us | 2518.86us | +| nativetorch | 2371.13us | 2380.8us | 421.74op/s | 420.03op/s | 26464.17us | 2390.8us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 270.83W | 276.0W | 4.96W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 250.33W | 255.0W | 3.76W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.767% | 2.515% | 48.78°C | 12.342% | +| flaggems监控结果 | 0.756% | 2.513% | 46.7°C | 12.16% | diff --git a/operation/benchmarks/isnan/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/isnan/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..211d333ba --- /dev/null +++ b/operation/benchmarks/isnan/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.44TFLOPS | 0.44TFLOPS | 0.14% | 0.14% | +| nativetorch | True | 0.46TFLOPS | 0.45TFLOPS | 0.15% | 0.15% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2439.51us | 2451.46us | 409.92op/s | 407.92op/s | 1308576.42us | 2512.04us | +| nativetorch | 2358.64us | 2368.51us | 423.97op/s | 422.21op/s | 19631.75us | 2372.38us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 280.04W | 286.0W | 5.39W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 254.68W | 260.0W | 4.32W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.734% | 2.516% | 49.38°C | 12.342% | +| flaggems监控结果 | 0.804% | 2.517% | 46.81°C | 12.16% | diff --git a/operation/benchmarks/isnan/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/isnan/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..349232822 --- /dev/null +++ b/operation/benchmarks/isnan/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.28TFLOPS | 0.27TFLOPS | 1.42% | 1.41% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 1.43% | 1.43% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3887.78us | 3907.58us | 257.22op/s | 255.91op/s | 1484714.41us | 3996.16us | +| nativetorch | 3841.4us | 3856.38us | 260.32op/s | 259.31op/s | 20411.28us | 3862.64us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 268.46W | 273.0W | 3.81W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 254.46W | 257.0W | 3.33W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.761% | 2.5% | 49.14°C | 17.394% | +| flaggems监控结果 | 0.79% | 2.497% | 47.69°C | 17.212% | diff --git a/operation/benchmarks/isnan/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/isnan/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/isnan/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/isnan/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/isnan/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ 
b/operation/benchmarks/isnan/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/isnan/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/isnan/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/isnan/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/layer_norm/cambricon/MLU/case_config.yaml b/operation/benchmarks/layer_norm/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/layer_norm/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/layer_norm/cambricon/MLU/env.sh b/operation/benchmarks/layer_norm/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/layer_norm/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/layer_norm/case_config.yaml b/operation/benchmarks/layer_norm/case_config.yaml new file mode 100644 index 000000000..805748c7e --- /dev/null +++ b/operation/benchmarks/layer_norm/case_config.yaml @@ -0,0 +1,7 @@ +bs: 20 +channel: 6 +hiddensize: 1048576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/layer_norm/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/layer_norm/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/layer_norm/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/layer_norm/kunlunxin/R300p/env.sh b/operation/benchmarks/layer_norm/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/layer_norm/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + 
+source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/layer_norm/main.py b/operation/benchmarks/layer_norm/main.py new file mode 100644 index 000000000..246d93371 --- /dev/null +++ b/operation/benchmarks/layer_norm/main.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + bs = case_config.bs + channel = case_config.channel + hiddensize = case_config.hiddensize + a = torch.randn(bs, channel, hiddensize, dtype=dtype[config.dataformat], requires_grad=True).to(0) + f = 
torch.nn.LayerNorm([channel, hiddensize]).to(0) + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * bs * channel * hiddensize * 9 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/layer_norm/metax/C550_64/case_config.yaml b/operation/benchmarks/layer_norm/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/layer_norm/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/layer_norm/metax/C550_64/env.sh b/operation/benchmarks/layer_norm/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/layer_norm/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/env.sh 
b/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/layer_norm/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/le/cambricon/MLU/case_config.yaml b/operation/benchmarks/le/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/le/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/le/cambricon/MLU/env.sh b/operation/benchmarks/le/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/le/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/le/case_config.yaml b/operation/benchmarks/le/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/le/case_config.yaml +++ b/operation/benchmarks/le/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/le/iluvatar/BI150/README.md b/operation/benchmarks/le/iluvatar/BI150/README.md new file mode 100644 index 000000000..29ea208ea --- /dev/null +++ b/operation/benchmarks/le/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: 
contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.12TFLOPS | 0.12TFLOPS | 0.47% | 0.47% | +| nativetorch | 0.00E+00 | 0.14TFLOPS | 0.14TFLOPS | 0.56% | 0.56% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 9258.69us | 9268.42us | 108.01op/s | 107.89op/s | 273463.87us | 9801.39us | +| nativetorch | 0.00E+00 | 7843.88us | 7866.95us | 127.49op/s | 127.11op/s | 8136.55us | 8127.15us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2085.25W | 2109.0W | 41.14W | / | 167.9W | 168.0W | 0.31W | 350W | +| flaggems监控结果 | 2078.6W | 2109.0W | 37.23W | / | 163.97W | 164.0W | 0.18W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 48.364% | 2.393% | 48.79°C | 16.364% | +| flaggems监控结果 | 46.779% | 2.394% | 48.03°C | 17.926% | diff --git a/operation/benchmarks/le/iluvatar/BI150/case_config.yaml b/operation/benchmarks/le/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/le/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/le/iluvatar/BI150/env.sh b/operation/benchmarks/le/iluvatar/BI150/env.sh new 
file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/le/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/le/iluvatar/BI150/requirements.txt b/operation/benchmarks/le/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/le/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/le/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/le/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/le/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/le/kunlunxin/R300p/env.sh b/operation/benchmarks/le/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/le/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/le/main.py b/operation/benchmarks/le/main.py index 01b17d6c5..b534816b9 100644 --- a/operation/benchmarks/le/main.py +++ b/operation/benchmarks/le/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + 
required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,36 +53,22 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - b = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.le(a_fp64, b_fp64) - - a = a.to(0) - b = b.to(0) - r_device = torch.le(a, b).cpu() - mape = ((r_device != r_fp64).float().sum()/r_fp64.numel()).item() - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -82,11 +76,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.le, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: x * 2 * m * 1024 * 1024 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "le", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -94,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = 
yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/le/metax/C550_64/case_config.yaml b/operation/benchmarks/le/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/le/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/le/metax/C550_64/env.sh b/operation/benchmarks/le/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/le/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/le/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/le/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..a1e40fe87 --- /dev/null +++ b/operation/benchmarks/le/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3971.76us | 3990.53us | 251.78op/s | 250.59op/s | 941877.65us | 4076.99us | +| nativetorch | 3891.03us | 3903.49us | 257.0op/s | 256.18op/s | 18442.44us | 3913.36us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 277.36W | 283.0W | 4.73W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 147.08W | / | 302.62W | 308.0W | 5.46W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.743% | 2.321% | 49.81°C | 17.394% | +| flaggems监控结果 | 0.773% | 2.322% | 51.58°C | 19.921% | diff --git a/operation/benchmarks/le/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/le/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..cdcc44d09 --- /dev/null +++ b/operation/benchmarks/le/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3962.36us | 3981.31us | 252.37op/s | 251.17op/s | 884966.72us | 4063.87us | +| nativetorch | 3854.44us | 3867.65us | 259.44op/s | 258.56op/s | 18615.12us | 3874.76us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 97.28W | / | 288.92W | 293.0W | 4.3W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 301.73W | 306.0W | 5.24W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.763% | 2.328% | 49.68°C | 17.394% | +| flaggems监控结果 | 0.768% | 2.329% | 51.13°C | 17.207% | diff --git a/operation/benchmarks/le/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/le/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..5bb54941c --- /dev/null +++ b/operation/benchmarks/le/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.8% | +| nativetorch | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.81% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6822.15us | 6842.37us | 146.58op/s | 146.15op/s | 1356467.05us | 6919.02us | +| nativetorch | 6778.42us | 6796.29us | 147.53op/s | 147.14op/s | 20355.52us | 6806.19us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1618.5W | 1716.0W | 85.0W | / | 264.88W | 268.0W | 2.19W | 400W | +| flaggems监控结果 | 1618.5W | 1716.0W | 127.87W | / | 287.9W | 292.0W | 4.25W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.512% | 2.297% | 48.12°C | 27.499% | +| flaggems监控结果 | 0.758% | 2.295% | 49.9°C | 27.312% | diff --git a/operation/benchmarks/le/nvidia/A100_40_SXM/README.md b/operation/benchmarks/le/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 89e08694f..000000000 --- a/operation/benchmarks/le/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 
内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.63% | 1.62% | -| nativetorch | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.62% | 1.62% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6770.18us | 6787.07us | 147.71op/s | 147.34op/s | 264046.26us | 6840.16us | -| nativetorch | 0.00E+00 | 6778.31us | 6796.29us | 147.53op/s | 147.14op/s | 8596.94us | 6795.57us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1664.0W | 1716.0W | 36.77W | / | 263.56W | 269.0W | 4.31W | 1664.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 299.65W | 303.0W | 3.38W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.828% | 1.402% | 48.57°C | 29.009% | -| flaggems监控结果 | 0.784% | 1.402% | 50.39°C | 26.295% | diff --git a/operation/benchmarks/le/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/le/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/le/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/le/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/linear/cambricon/MLU/case_config.yaml b/operation/benchmarks/linear/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/linear/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/linear/cambricon/MLU/env.sh b/operation/benchmarks/linear/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/linear/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/linear/case_config.yaml b/operation/benchmarks/linear/case_config.yaml index ea88f2149..c0ff6ec74 100644 --- a/operation/benchmarks/linear/case_config.yaml +++ b/operation/benchmarks/linear/case_config.yaml @@ -1,8 +1,7 @@ M: 8192 N: 8192 K: 8192 -SPECTFLOPS: 10000 WARMUP: 100 -ITERS: 50000 +ITERS: 50 KERNELWARMUP: 10 KERNELITERS: 1000 diff --git a/operation/benchmarks/linear/iluvatar/BI150/README.md b/operation/benchmarks/linear/iluvatar/BI150/README.md new file mode 100644 index 000000000..04585f81e --- /dev/null +++ b/operation/benchmarks/linear/iluvatar/BI150/README.md @@ -0,0 +1,57 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 注意事项 +测试linear时必须调节降频问题,因此需要:bash vendors/iluvatar/dvfs.sh && python3 run.py + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 1.77E-04 | 92.3TFLOPS | 92.1TFLOPS | 47.81% | 47.99% | +| nativetorch | 1.77E-04 | 95.76TFLOPS | 95.48TFLOPS | 49.84% | 49.73% | + 
+## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 2.40E-06 | 5956.1us | 5969.44us | 167.9op/s | 167.52op/s | 33608132.07us | 7226.37us | +| nativetorch | 2.40E-06 | 5740.85us | 5758.0us | 174.19op/s | 173.67op/s | 6583.47us | 6067.62us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2178.67W | 2261.0W | 116.44W | / | 299.21W | 301.0W | 8.6W | 350W | +| flaggems监控结果 | 2170.75W | 2223.0W | 90.5W | / | 276.31W | 278.0W | 3.14W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 45.339% | 2.572% | 85.67°C | 1.91% | +| flaggems监控结果 | 42.741% | 2.559% | 81.69°C | 2.631% | \ No newline at end of file diff --git a/operation/benchmarks/linear/iluvatar/BI150/case_config.yaml b/operation/benchmarks/linear/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..5112311a5 --- /dev/null +++ b/operation/benchmarks/linear/iluvatar/BI150/case_config.yaml @@ -0,0 +1,8 @@ +M: 4096 +N: 8192 +K: 8192 +SPECTFLOPS: 192 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/linear/iluvatar/BI150/env.sh b/operation/benchmarks/linear/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/linear/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export 
PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/linear/iluvatar/BI150/requirements.txt b/operation/benchmarks/linear/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/linear/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/linear/main.py b/operation/benchmarks/linear/main.py index 5be8803a0..5307f1f8f 100644 --- a/operation/benchmarks/linear/main.py +++ b/operation/benchmarks/linear/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,36 +53,22 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 16-times smaller operation" - ) # correctness is implemented casebycase m = case_config.M n = case_config.N k = case_config.K - dtype = {"FP16": torch.float16} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - w = torch.nn.Linear(n // 16, k // 16, bias=False, dtype=dtype[config.dataformat]) - x = torch.randn(m // 16, n // 16, dtype=dtype[config.dataformat]) 
- - w_fp64 = torch.nn.Linear(n // 16, k // 16, bias=False, dtype=torch.float64) - w_fp64.weight.data.copy_(w.weight.to(torch.float64)) - x_fp64 = x.to(torch.float64) - r_fp64 = w_fp64(x_fp64) - - w = w.to(0) - x = x.to(0) - - r_device = w(x).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - mmape.append(mape) - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) w = torch.nn.Linear(n, k, bias=False, dtype=dtype[config.dataformat]).to(0) x = torch.randn(m, n, dtype=dtype[config.dataformat]).to(0) @@ -85,8 +79,8 @@ def main(config, case_config): op2flops = lambda x: x * 2 * m * n * k perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "Linear", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -94,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/sum/nvidia/A100_40_SXM/README.md b/operation/benchmarks/linear/metax/C550_64/README.md similarity index 57% rename from operation/benchmarks/sum/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/linear/metax/C550_64/README.md index d40d328be..b2a3b9b5c 100644 --- a/operation/benchmarks/sum/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/linear/metax/C550_64/README.md @@ -1,11 +1,11 @@ # 参评AI芯片信息 -* 厂商:Nvidia +* 厂商:Metax -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:350W # 所用服务器配置 @@ -13,12 +13,12 @@ * 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB +* 服务器型号:Nettrix X640 G40 +* 
操作系统版本:Ubuntu 20.04.1 LTS +* 操作系统内核:linux5.4.0-42-generic +* CPU:Intel(R) Xeon(R) Gold 6348-112core +* docker版本:27.0.3 +* 内存:2.0TiB * 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 # 算子库版本 @@ -31,8 +31,8 @@ https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a3 | 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | | ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 5.950E-7 | 0.75TFLOPS | 0.74TFLOPS | 3.83% | 3.8% | -| nativetorch | 3.821E-7 | 0.73TFLOPS | 0.72TFLOPS | 3.73% | 3.7% | +| flaggems | 1.771E-4 | | | 46.77% | 45.41% | +| nativetorch | 1.772E-4 | | | 70.28% | 70.02% | 说明:kerneltime采用triton.testing.do\_bench接口给出,准确度低于nsys等profiling工具 @@ -40,19 +40,19 @@ https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a3 | 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | | ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 2.694E-6 | 2875.25us | 2896.9us | 347.80 op/s | 345.20 op/s | 829415.49 us | 2940.47 us | -| nativetorch | 9.708E-7 | 2952.52us | 2974.72us | 338.69 op/s | 336.17 op/s | 3129.86 us | 2974.01 us | +| flaggems | 1.752E-6 | | | | | | | +| nativetorch | 2.510E-6 | | | | | | | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | | ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 274.1W | 279.0W | 4.83W | 400W | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 265.1W | 270.0W | 4.91W | 400W | +| flaggems监控结果 | 193.0W | 218.39W | 30.09W | / | 72.0W | 72.0W | 0.0W | 350W | +| nativetorch监控结果 | 207.29W | 245.18W | 15.85W | / | 72.0W | 72.0W | 0.0W | 350W | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | | ---- | --------- | -------- 
| ------------ | -------------- | -| flaggems监控结果 | 0.649% | 1.283% | 47.13°C | 11.138% | -| nativetorch监控结果 | 0.674% | 1.291% | 47.04°C | 11.326% | +| flaggems监控结果 | 3.089% | 2.042% | 44.0°C | 18.599% | +| nativetorch监控结果 | 4.314% | 1.991 | 44.0°C | 18.599% | diff --git a/operation/benchmarks/linear/metax/C550_64/case_config.yaml b/operation/benchmarks/linear/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..80f1f8af0 --- /dev/null +++ b/operation/benchmarks/linear/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50 diff --git a/operation/benchmarks/linear/metax/C550_64/env.sh b/operation/benchmarks/linear/metax/C550_64/env.sh new file mode 100644 index 000000000..79ff0fea1 --- /dev/null +++ b/operation/benchmarks/linear/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_ENABLE_COMPILER_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/linear/metax/C550_64/requirements.txt b/operation/benchmarks/linear/metax/C550_64/requirements.txt new file mode 100644 index 000000000..7248303e5 --- /dev/null +++ b/operation/benchmarks/linear/metax/C550_64/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git a/operation/benchmarks/linear/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/linear/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..c93c73180 --- /dev/null +++ b/operation/benchmarks/linear/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 256.74TFLOPS | 257.93TFLOPS | 82.29% | 82.67% | +| nativetorch | True | 268.61TFLOPS | 280.42TFLOPS | 86.09% | 89.88% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4282.58us | 4262.91us | 233.5op/s | 234.58op/s | 22318671.8us | 4529.59us | +| nativetorch | 4093.27us | 3920.9us | 244.3op/s | 255.04op/s | 154468.03us | 4139.56us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1430.0W | 1482.0W | 36.77W | / | 158.55W | 177.0W | 18.6W | 400W | +| flaggems监控结果 | 1456.0W | 1482.0W | 36.77W | / | 156.87W | 188.0W | 19.02W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.686% | 1.084% | 36.25°C | 2.534% | +| flaggems监控结果 | 0.697% | 1.086% | 36.04°C | 2.77% | diff --git a/operation/benchmarks/linear/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/linear/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..e1005a013 --- /dev/null +++ b/operation/benchmarks/linear/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 259.29TFLOPS | 262.34TFLOPS | 83.11% | 84.08% | +| nativetorch | True | 259.62TFLOPS | 262.02TFLOPS | 83.21% | 83.98% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4240.44us | 4191.23us | 235.82op/s | 238.59op/s | 24118510.88us | 4329.25us | +| nativetorch | 4235.09us | 4196.35us | 236.12op/s | 238.3op/s | 143678.03us | 4232.55us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1430.0W | 1482.0W | 36.77W | / | 158.93W | 189.0W | 20.27W | 400W | +| flaggems监控结果 | 1456.0W | 1482.0W | 36.77W | / | 403.5W | 404.0W | 0.5W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.606% | 1.085% | 36.21°C | 2.534% | +| flaggems监控结果 | 0.648% | 1.087% | 45.0°C | 3.377% | diff --git a/operation/benchmarks/linear/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/linear/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..20f640c95 --- /dev/null +++ b/operation/benchmarks/linear/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 13.63TFLOPS | 13.63TFLOPS | 69.89% | 69.9% | +| nativetorch | True | 18.98TFLOPS | 18.98TFLOPS | 97.35% | 97.32% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 80682.31us | 80667.65us | 12.39op/s | 12.4op/s | 103876277.64us | 80805.88us | +| nativetorch | 57919.12us | 57937.92us | 17.27op/s | 17.26op/s | 154126.57us | 58291.09us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1430.0W | 1482.0W | 36.77W | / | 271.0W | 318.0W | 57.52W | 400W | +| flaggems监控结果 | 1443.0W | 1560.0W | 67.55W | / | 187.88W | 248.0W | 28.33W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.602% | 1.084% | 48.5°C | 3.634% | +| flaggems监控结果 | 0.679% | 1.088% | 40.5°C | 4.349% | diff --git a/operation/benchmarks/linear/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/linear/nvidia/A100_40_SXM/case_config.yaml index f3489fba2..80f1f8af0 100644 --- a/operation/benchmarks/linear/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/linear/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ -ITERS: 50000 -SPECTFLOPS: 312 +ITERS: 50 diff --git a/operation/benchmarks/log_softmax/cambricon/MLU/case_config.yaml 
b/operation/benchmarks/log_softmax/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..749516b3b --- /dev/null +++ b/operation/benchmarks/log_softmax/cambricon/MLU/case_config.yaml @@ -0,0 +1,3 @@ +SPECTFLOPS: 999999 +M: 16 +N: 1024 \ No newline at end of file diff --git a/operation/benchmarks/log_softmax/cambricon/MLU/env.sh b/operation/benchmarks/log_softmax/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/log_softmax/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/log_softmax/case_config.yaml b/operation/benchmarks/log_softmax/case_config.yaml new file mode 100644 index 000000000..4fe1f64e1 --- /dev/null +++ b/operation/benchmarks/log_softmax/case_config.yaml @@ -0,0 +1,6 @@ +M: 1024 +N: 10 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/abs/nvidia/A100_40_SXM/README.md b/operation/benchmarks/log_softmax/iluvatar/BI150/README.md similarity index 52% rename from operation/benchmarks/abs/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/log_softmax/iluvatar/BI150/README.md index 7cc9a4677..afa93055d 100644 --- a/operation/benchmarks/abs/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/log_softmax/iluvatar/BI150/README.md @@ -1,56 +1,54 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. 
Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6191.22us | 6194.18us | 161.52op/s | 161.44op/s | 253536.13us | 6251.42us | -| nativetorch | 0.00E+00 | 6196.2us | 6196.22us | 161.39op/s | 161.39op/s | 11111.41us | 6269.32us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 254.56W | 257.0W | 2.96W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 294.6W | 300.0W | 3.22W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.792% | 1.3% | 48.05°C | 31.535% | -| flaggems监控结果 | 0.824% | 1.301% | 51.02°C | 31.347% | +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | 
-------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 3.31E-08 | 0.12TFLOPS | 0.11TFLOPS | 0.47% | 0.47% | +| nativetorch | 3.32E-08 | 0.06TFLOPS | 0.06TFLOPS | 0.24% | 0.24% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 3.96E-10 | 224.78us | 228.62us | 4448.74op/s | 4374.01op/s | 1017780.2us | 2572.8us | +| nativetorch | 5.49E-10 | 441.94us | 445.58us | 2262.74op/s | 2244.24op/s | 638.18us | 592.6us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2033.0W | 2033.0W | 0.0W | / | 128.0W | 129.0W | 2.0W | 350W | +| flaggems监控结果 | 2033.0W | 2033.0W | 0.0W | / | 165.0W | 175.0W | 14.14W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 56.639% | 2.386% | 43.2°C | 1.489% | +| flaggems监控结果 | 43.249% | 2.386% | 53.0°C | 2.24% | diff --git a/operation/benchmarks/log_softmax/iluvatar/BI150/case_config.yaml b/operation/benchmarks/log_softmax/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..68d76b609 --- /dev/null +++ b/operation/benchmarks/log_softmax/iluvatar/BI150/case_config.yaml @@ -0,0 +1,7 @@ +M: 512 +N: 10 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/log_softmax/iluvatar/BI150/env.sh b/operation/benchmarks/log_softmax/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/log_softmax/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export 
PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/log_softmax/iluvatar/BI150/requirements.txt b/operation/benchmarks/log_softmax/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/log_softmax/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/log_softmax/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/log_softmax/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..398fbc7c8 --- /dev/null +++ b/operation/benchmarks/log_softmax/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 256] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/log_softmax/kunlunxin/R300p/env.sh b/operation/benchmarks/log_softmax/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/log_softmax/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/log_softmax/main.py b/operation/benchmarks/log_softmax/main.py new file mode 100644 index 000000000..28c2fc039 --- /dev/null +++ b/operation/benchmarks/log_softmax/main.py @@ -0,0 +1,113 @@ + # Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess +import math + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + m = case_config.M + n = case_config.N + f = torch.nn.LogSoftmax(dim=1) + # default shape: (M * 1024, N) + shape = (m * 1024, n) + + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape + elif config.vendor == 'cambricon': + shape = (m, 1024, n) + + a = torch.randn(shape, dtype=dtype[config.dataformat], requires_grad=True).to(0) + print(f'Shape for performance test: {a.shape}') + + latency_nowarm, latency_warm, cputime, kerneltime = 
do_test( + f, (a, ), host_device_sync, config, case_config, bp=True) # 调整为torch.sub + + op2flops = lambda x: x * 4 * math.prod(shape) + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/log_softmax/metax/C550_64/case_config.yaml b/operation/benchmarks/log_softmax/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..8693d5e39 --- /dev/null +++ b/operation/benchmarks/log_softmax/metax/C550_64/case_config.yaml @@ -0,0 +1,2 @@ +N: 16 +ITERS: 50000 diff --git a/operation/benchmarks/log_softmax/metax/C550_64/env.sh b/operation/benchmarks/log_softmax/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/log_softmax/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..b8a2c2690 --- /dev/null +++ b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* 
docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.33TFLOPS | 0.47TFLOPS | 0.1% | 0.15% | +| nativetorch | True | 0.61TFLOPS | 0.6TFLOPS | 0.2% | 0.19% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 386.08us | 265.22us | 2590.16op/s | 3770.51op/s | 3066398.64us | 252.55us | +| nativetorch | 204.78us | 208.9us | 4883.22op/s | 4787.07op/s | 9292.98us | 142.41us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1443.0W | 1482.0W | 39.0W | / | 241.6W | 315.0W | 59.59W | 400W | +| flaggems监控结果 | 1443.0W | 1482.0W | 39.0W | / | 175.0W | 198.0W | 28.93W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 2.157% | 1.267% | 50.5°C | 3.106% | +| flaggems监控结果 | 0.581% | 1.277% | 37.81°C | 3.106% | diff --git a/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..05387b5d1 --- /dev/null +++ b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 
+* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.32TFLOPS | 0.45TFLOPS | 0.1% | 0.14% | +| nativetorch | True | 0.62TFLOPS | 0.61TFLOPS | 0.2% | 0.19% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 394.74us | 282.62us | 2533.3op/s | 3538.27op/s | 3115999.44us | 215.19us | +| nativetorch | 203.9us | 207.87us | 4904.4op/s | 4810.65op/s | 8886.84us | 140.31us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1443.0W | 1482.0W | 39.0W | / | 237.0W | 325.0W | 59.94W | 400W | +| flaggems监控结果 | 1443.0W | 1482.0W | 39.0W | / | 194.45W | 221.0W | 13.21W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.095% | 1.263% | 50.5°C | 3.106% | +| flaggems监控结果 | 0.624% | 1.275% | 37.84°C | 3.106% | diff --git a/operation/benchmarks/eq/nvidia/A100_40_SXM/README.md b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/FP32_README.md similarity index 50% rename from operation/benchmarks/eq/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/log_softmax/nvidia/A100_40_SXM/FP32_README.md index 9173fdf51..d262bca0c 100644 --- a/operation/benchmarks/eq/nvidia/A100_40_SXM/README.md +++ 
b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/FP32_README.md @@ -2,7 +2,6 @@ * 厂商:Nvidia - * 产品名称:A100 * 产品型号:A100-40GiB-SXM * TDP:400W @@ -10,9 +9,7 @@ # 所用服务器配置 * 服务器数量:1 - - -* 单服务器内使用卡数:1 +* 单服务器内使用卡数: 1 * 服务器型号:DGX A100 * 操作系统版本:Ubuntu 20.04.4 LTS * 操作系统内核:linux5.4.0-113 @@ -23,34 +20,34 @@ # 算子库版本 -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 # 评测结果 ## 核心评测结果 -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | | ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 0.1% | 0.1% | -| nativetorch | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 0.1% | 0.1% | +| flaggems | True | 0.29TFLOPS | 0.41TFLOPS | 1.48% | 2.09% | +| nativetorch | True | 0.44TFLOPS | 0.44TFLOPS | 2.28% | 2.25% | ## 其他评测结果 -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6769.91us | 6786.05us | 147.71op/s | 147.36op/s | 262548.71us | 6835.66us | -| nativetorch | 0.00E+00 | 6778.26us | 6795.26us | 147.53op/s | 147.16op/s | 9427.55us | 6795.5us | +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 437.21us | 309.25us | 2287.21op/s | 3233.65op/s | 4480521.52us | 231.46us | +| nativetorch | 282.92us | 286.72us | 3534.62op/s | 3487.72op/s | 8203.01us | 144.71us | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | | ---- | 
------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1664.0W | 1716.0W | 36.77W | / | 255.12W | 257.0W | 3.16W | 1664.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 293.74W | 297.0W | 3.46W | 1716.0 | +| nativetorch监控结果 | 1443.0W | 1482.0W | 39.0W | / | 260.2W | 307.0W | 53.7W | 400W | +| flaggems监控结果 | 1443.0W | 1482.0W | 39.0W | / | 203.75W | 240.0W | 19.0W | 400W | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | | ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.525% | 2.095% | 49.42°C | 26.483% | -| flaggems监控结果 | 0.599% | 2.093% | 51.94°C | 26.295% | +| nativetorch监控结果 | 0.55% | 1.269% | 42.0°C | 3.106% | +| flaggems监控结果 | 1.445% | 1.272% | 38.98°C | 3.106% | diff --git a/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/log_softmax/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/lt/cambricon/MLU/case_config.yaml b/operation/benchmarks/lt/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/lt/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ 
+SPECTFLOPS: 999999 diff --git a/operation/benchmarks/lt/cambricon/MLU/env.sh b/operation/benchmarks/lt/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/lt/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/lt/case_config.yaml b/operation/benchmarks/lt/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/lt/case_config.yaml +++ b/operation/benchmarks/lt/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/lt/iluvatar/BI150/README.md b/operation/benchmarks/lt/iluvatar/BI150/README.md new file mode 100644 index 000000000..774d60018 --- /dev/null +++ b/operation/benchmarks/lt/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.11TFLOPS | 0.11TFLOPS | 0.46% | 0.46% | +| nativetorch | 0.00E+00 | 0.14TFLOPS | 0.14TFLOPS | 0.56% | 0.56% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 9396.11us | 9432.73us | 106.43op/s | 106.01op/s | 243689.85us | 10774.47us | +| nativetorch | 0.00E+00 | 7844.46us | 7864.16us | 127.48op/s | 127.16op/s | 8093.03us | 
8074.71us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2071.0W | 2090.0W | 32.91W | / | 167.74W | 168.0W | 0.44W | 350W | +| flaggems监控结果 | 2074.8W | 2090.0W | 30.4W | / | 163.99W | 164.0W | 0.1W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 40.09% | 2.393% | 47.65°C | 16.364% | +| flaggems监控结果 | 40.439% | 2.394% | 46.77°C | 16.364% | \ No newline at end of file diff --git a/operation/benchmarks/lt/iluvatar/BI150/case_config.yaml b/operation/benchmarks/lt/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/lt/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/lt/iluvatar/BI150/env.sh b/operation/benchmarks/lt/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/lt/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/lt/iluvatar/BI150/requirements.txt b/operation/benchmarks/lt/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/lt/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/lt/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/lt/kunlunxin/R300p/case_config.yaml new file mode 
100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/lt/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/lt/kunlunxin/R300p/env.sh b/operation/benchmarks/lt/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/lt/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/lt/main.py b/operation/benchmarks/lt/main.py index 746e66651..3c789ae95 100644 --- a/operation/benchmarks/lt/main.py +++ b/operation/benchmarks/lt/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,36 +53,22 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - b = torch.randn(m, 
dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.lt(a_fp64, b_fp64) - - a = a.to(0) - b = b.to(0) - r_device = torch.lt(a, b).cpu() - mape = ((r_device != r_fp64).float().sum()/r_fp64.numel()).item() - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -82,11 +76,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.lt, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: x * 2 * m * 1024 * 1024 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "lt", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -94,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/lt/metax/C550_64/case_config.yaml b/operation/benchmarks/lt/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/lt/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/lt/metax/C550_64/env.sh b/operation/benchmarks/lt/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/lt/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git 
a/operation/benchmarks/lt/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/lt/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..d8508e492 --- /dev/null +++ b/operation/benchmarks/lt/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3971.7us | 3990.53us | 251.78op/s | 250.59op/s | 2412245.21us | 4067.72us | +| nativetorch | 3890.94us | 3903.49us | 257.01op/s | 256.18op/s | 18805.12us | 3909.68us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1716.0W | 132.57W | / | 275.18W | 281.0W | 4.85W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 147.08W | / | 302.98W | 308.0W | 5.55W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.744% | 2.322% | 49.44°C | 17.394% | +| flaggems监控结果 | 0.861% | 2.321% | 51.67°C | 
19.921% | diff --git a/operation/benchmarks/lt/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/lt/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..c0f88d670 --- /dev/null +++ b/operation/benchmarks/lt/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3962.87us | 3981.31us | 252.34op/s | 251.17op/s | 969223.67us | 4055.96us | +| nativetorch | 3854.36us | 3867.65us | 259.45op/s | 258.56op/s | 16959.71us | 3875.4us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1612.0W | 1716.0W | 97.28W | / | 286.74W | 291.0W | 4.71W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 147.08W | / | 301.75W | 306.0W | 5.05W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.721% | 2.326% | 49.69°C | 17.394% | +| flaggems监控结果 | 0.762% | 
2.325% | 51.19°C | 17.207% | diff --git a/operation/benchmarks/lt/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/lt/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..4b40d134c --- /dev/null +++ b/operation/benchmarks/lt/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.8% | +| nativetorch | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.81% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6822.2us | 6840.32us | 146.58op/s | 146.19op/s | 2148358.73us | 6960.76us | +| nativetorch | 6778.53us | 6796.29us | 147.52op/s | 147.14op/s | 22914.94us | 6804.9us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1716.0W | 95.53W | / | 265.1W | 269.0W | 2.47W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 135.1W | / | 288.59W | 292.0W | 3.28W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.737% | 2.295% | 48.06°C | 30.025% | +| flaggems监控结果 
| 0.743% | 2.295% | 49.36°C | 30.025% | diff --git a/operation/benchmarks/lt/nvidia/A100_40_SXM/README.md b/operation/benchmarks/lt/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 368eabf5f..000000000 --- a/operation/benchmarks/lt/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.63% | 1.62% | -| nativetorch | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.62% | 1.62% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6770.06us | 6786.05us | 147.71op/s | 147.36op/s | 271907.01us | 6842.09us | -| nativetorch | 0.00E+00 | 6778.35us | 6794.24us | 147.53op/s | 147.18op/s | 746839.2us | 6800.15us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1664.0W | 1716.0W | 36.77W | / | 264.24W | 267.0W | 3.52W | 1664.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 300.38W | 303.0W | 3.5W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | 
-------------- | -| nativetorch监控结果 | 0.846% | 1.402% | 48.69°C | 26.483% | -| flaggems监控结果 | 0.749% | 1.402% | 50.46°C | 26.295% | diff --git a/operation/benchmarks/lt/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/lt/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/lt/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/lt/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git a/operation/benchmarks/max/cambricon/MLU/case_config.yaml b/operation/benchmarks/max/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/max/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/max/cambricon/MLU/env.sh b/operation/benchmarks/max/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/max/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/max/case_config.yaml b/operation/benchmarks/max/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/max/case_config.yaml +++ b/operation/benchmarks/max/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/max/iluvatar/BI150/README.md b/operation/benchmarks/max/iluvatar/BI150/README.md new file mode 100644 index 000000000..fe60c4945 --- /dev/null +++ b/operation/benchmarks/max/iluvatar/BI150/README.md @@ -0,0 +1,55 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 
核心评测结果 +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.16TFLOPS | 0.16TFLOPS | 0.66% | 0.65% | +| nativetorch | 0.00E+00 | 0.09TFLOPS | 0.09TFLOPS | 0.37% | 0.37% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 3321.15us | 3358.5us | 301.1op/s | 297.75op/s | 782670.81us | 3724.88us | +| nativetorch | 0.00E+00 | 5932.36us | 5957.83us | 168.57op/s | 167.85op/s | 6175.06us | 6183.7us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2052.0W | 2071.0W | 26.87W | / | 141.64W | 143.0W | 3.11W | 350W | +| flaggems监控结果 | 2061.5W | 2109.0W | 47.5W | / | 170.97W | 171.0W | 0.17W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 47.447% | 2.389% | 43.88°C | 6.989% | +| flaggems监控结果 | 54.227% | 2.396% | 48.61°C | 6.989% | diff --git a/operation/benchmarks/max/iluvatar/BI150/case_config.yaml b/operation/benchmarks/max/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/max/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/max/iluvatar/BI150/env.sh b/operation/benchmarks/max/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ 
b/operation/benchmarks/max/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/max/iluvatar/BI150/requirements.txt b/operation/benchmarks/max/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/max/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/max/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/max/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..1138a649a --- /dev/null +++ b/operation/benchmarks/max/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 1] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/max/kunlunxin/R300p/env.sh b/operation/benchmarks/max/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..176026da9 --- /dev/null +++ b/operation/benchmarks/max/kunlunxin/R300p/env.sh @@ -0,0 +1,6 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +export Triton_big_instcombine=1000 + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/max/main.py b/operation/benchmarks/max/main.py index 9833f2cda..9077db1da 100644 --- a/operation/benchmarks/max/main.py +++ b/operation/benchmarks/max/main.py @@ -4,12 +4,13 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess +import math sys.path.append("..") from drivers.utils import * @@ -23,6 +24,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + 
type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,43 +54,39 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements + # default shape: (M, 1024, 1024) + shape = (m, 1024, 1024) - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.max(a_fp64) - - a = a.to(0) - r_device = torch.max(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.max, (a, ), host_device_sync, config, case_config) - op2flops = lambda x: x * m * 1024 * 1024 + op2flops = lambda x: x * math.prod(shape) perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "max", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, 
latency_nowarm, latency_warm) @@ -89,6 +94,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -101,4 +107,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + main(config, case_config) diff --git a/operation/benchmarks/max/metax/C550_64/case_config.yaml b/operation/benchmarks/max/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/max/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/max/metax/C550_64/env.sh b/operation/benchmarks/max/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/max/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/max/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/max/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..f3b6788e5 --- /dev/null +++ b/operation/benchmarks/max/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.67TFLOPS | 0.66TFLOPS | 0.21% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.7us | 1465.34us | 692.67op/s | 682.43op/s | 2584879.14us | 1521.05us | +| nativetorch | 1608.7us | 1629.18us | 621.62op/s | 613.8op/s | 21389.17us | 1627.47us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 307.06W | 313.0W | 5.94W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 281.2W | 287.0W | 4.15W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.707% | 2.32% | 50.91°C | 7.295% | +| flaggems监控结果 | 0.635% | 2.324% | 46.59°C | 7.902% | diff --git a/operation/benchmarks/max/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/max/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..8dae5e2ed --- /dev/null +++ b/operation/benchmarks/max/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.68TFLOPS | 0.67TFLOPS | 0.22% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.77us | 1465.34us | 692.63op/s | 682.43op/s | 1186949.93us | 1507.3us | +| nativetorch | 1590.38us | 1612.8us | 628.78op/s | 620.04op/s | 25896.01us | 1663.53us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 310.59W | 323.0W | 24.09W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 279.6W | 285.0W | 4.66W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.697% | 2.329% | 50.12°C | 7.902% | +| flaggems监控结果 | 0.76% | 2.327% | 45.38°C | 7.107% | diff --git a/operation/benchmarks/max/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/max/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..00533e2b2 --- /dev/null +++ b/operation/benchmarks/max/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 
+ +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | +| nativetorch | True | 0.36TFLOPS | 0.36TFLOPS | 1.86% | 1.85% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2872.74us | 2894.85us | 348.1op/s | 345.44op/s | 1118567.39us | 2940.77us | +| nativetorch | 2961.98us | 2982.91us | 337.61op/s | 335.24op/s | 22158.57us | 2983.21us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 278.13W | 281.0W | 4.0W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 277.96W | 282.0W | 3.28W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.691% | 2.294% | 46.75°C | 12.347% | +| flaggems监控结果 | 0.747% | 2.302% | 46.64°C | 12.954% | diff --git a/operation/benchmarks/max/nvidia/A100_40_SXM/README.md b/operation/benchmarks/max/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 9ad005e13..000000000 --- a/operation/benchmarks/max/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | -| nativetorch | 0.00E+00 | 0.36TFLOPS | 0.36TFLOPS | 1.86% | 1.85% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 2872.97us | 2894.85us | 348.07op/s | 345.44op/s | 715679.85us | 2927.86us | -| nativetorch | 0.00E+00 | 2961.24us | 2982.91us | 337.7op/s | 335.24op/s | 3189.87us | 2980.2us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1716.0W | 1716.0W | 0.0W | / | 274.77W | 280.0W | 3.77W | 1716.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 276.03W | 278.0W | 3.02W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.711% | 1.41% | 46.98°C | 11.326% | -| flaggems监控结果 | 0.747% | 1.405% | 46.16°C | 11.138% | diff --git a/operation/benchmarks/max/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/max/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/max/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/max/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/mean/cambricon/MLU/case_config.yaml b/operation/benchmarks/mean/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/mean/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/mean/cambricon/MLU/env.sh b/operation/benchmarks/mean/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/mean/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/mean/case_config.yaml b/operation/benchmarks/mean/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/mean/case_config.yaml +++ b/operation/benchmarks/mean/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/add/nvidia/A100_40_SXM/README.md b/operation/benchmarks/mean/iluvatar/BI150/README.md similarity index 52% rename from operation/benchmarks/add/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/mean/iluvatar/BI150/README.md index ad93ac1fe..b5a4cfe6f 100644 --- a/operation/benchmarks/add/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/mean/iluvatar/BI150/README.md @@ -1,56 +1,54 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. 
Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 1.62E-08 | 0.23TFLOPS | 0.23TFLOPS | 0.07% | 0.07% | -| nativetorch | 1.62E-08 | 0.23TFLOPS | 0.23TFLOPS | 0.07% | 0.07% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 5.66E-10 | 9435.2us | 9470.98us | 105.99op/s | 105.59op/s | 307676.86us | 9541.19us | -| nativetorch | 5.66E-10 | 9473.79us | 9480.19us | 105.55op/s | 105.48op/s | 14133.42us | 9528.13us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 249.48W | 252.0W | 3.56W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 278.41W | 282.0W | 3.02W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.542% | 2.388% | 49.32°C | 41.64% | -| flaggems监控结果 | 1.0% | 2.58% | 51.29°C | 62.816% | +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | 
-------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 2.01E-07 | 0.16TFLOPS | 0.16TFLOPS | 0.66% | 0.65% | +| nativetorch | 1.66E-07 | 0.09TFLOPS | 0.09TFLOPS | 0.37% | 0.37% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 3.61E-07 | 3329.21us | 3357.1us | 300.37op/s | 297.88op/s | 779528.2us | 3780.82us | +| nativetorch | 2.09E-07 | 5926.53us | 5951.03us | 168.73op/s | 168.04op/s | 6167.72us | 6243.39us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2052.0W | 2071.0W | 26.87W | / | 140.0W | 140.0W | 0.0W | 350W | +| flaggems监控结果 | 2052.0W | 2090.0W | 38.0W | / | 169.91W | 172.0W | 6.35W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 39.402% | 2.389% | 42.98°C | 6.989% | +| flaggems监控结果 | 38.954% | 2.396% | 48.47°C | 6.989% | \ No newline at end of file diff --git a/operation/benchmarks/mean/iluvatar/BI150/case_config.yaml b/operation/benchmarks/mean/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/mean/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/mean/iluvatar/BI150/env.sh b/operation/benchmarks/mean/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/mean/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export 
PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/mean/iluvatar/BI150/requirements.txt b/operation/benchmarks/mean/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/mean/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/mean/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/mean/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..398fbc7c8 --- /dev/null +++ b/operation/benchmarks/mean/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 256] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/mean/kunlunxin/R300p/env.sh b/operation/benchmarks/mean/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/mean/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/mean/main.py b/operation/benchmarks/mean/main.py index 5622e29f6..4a7116d10 100644 --- a/operation/benchmarks/mean/main.py +++ b/operation/benchmarks/mean/main.py @@ -4,12 +4,13 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess +import math sys.path.append("..") from drivers.utils import * @@ -23,6 +24,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + 
type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,43 +54,39 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements + # default shape: (M, 1024, 1024) + shape = (m, 1024, 1024) - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.mean(a_fp64) - - a = a.to(0) - r_device = torch.mean(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.mean, (a, ), host_device_sync, config, case_config) - op2flops = lambda x: x * m * 1024 * 1024 + op2flops = lambda x: x * math.prod(shape) perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "mean", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +94,7 @@ def main(config, case_config): config = 
parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -101,4 +107,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + main(config, case_config) diff --git a/operation/benchmarks/mean/metax/C550_64/case_config.yaml b/operation/benchmarks/mean/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/mean/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/mean/metax/C550_64/env.sh b/operation/benchmarks/mean/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/mean/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/mean/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/mean/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..6460949ad --- /dev/null +++ b/operation/benchmarks/mean/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.69TFLOPS | 0.68TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.54us | 1465.34us | 692.74op/s | 682.43op/s | 1503555.34us | 1511.85us | +| nativetorch | 1556.52us | 1576.96us | 642.46op/s | 634.13op/s | 1055802.83us | 1605.92us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 267.38W | 271.0W | 2.32W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 279.36W | 283.0W | 3.26W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.715% | 2.324% | 45.54°C | 7.902% | +| flaggems监控结果 | 0.769% | 2.327% | 46.77°C | 7.107% | diff --git a/operation/benchmarks/mean/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/mean/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..951e3eba9 --- /dev/null +++ b/operation/benchmarks/mean/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.69TFLOPS | 0.68TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.65us | 1465.34us | 692.69op/s | 682.43op/s | 1077937.79us | 1513.12us | +| nativetorch | 1555.63us | 1576.96us | 642.82op/s | 634.13op/s | 703745.72us | 1575.87us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 278.62W | 283.0W | 3.53W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 279.27W | 283.0W | 3.15W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.699% | 2.326% | 46.89°C | 7.295% | +| flaggems监控结果 | 0.793% | 2.327% | 45.62°C | 7.107% | diff --git a/operation/benchmarks/mean/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/mean/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..cb1339e72 --- /dev/null +++ b/operation/benchmarks/mean/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | +| nativetorch | True | 0.36TFLOPS | 0.36TFLOPS | 1.87% | 1.85% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2874.66us | 2895.87us | 347.87op/s | 345.32op/s | 1190148.65us | 2934.33us | +| nativetorch | 2951.85us | 2973.7us | 338.77op/s | 336.28op/s | 80140.21us | 2990.15us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 265.62W | 267.0W | 2.16W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 273.97W | 277.0W | 2.81W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.814% | 2.299% | 46.17°C | 12.347% | +| flaggems监控结果 | 0.776% | 2.297% | 46.49°C | 12.16% | diff --git a/operation/benchmarks/mean/nvidia/A100_40_SXM/README.md b/operation/benchmarks/mean/nvidia/A100_40_SXM/README.md deleted file mode 100644 index ac91d31c3..000000000 --- a/operation/benchmarks/mean/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 
内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 6.34E-07 | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | -| nativetorch | 3.81E-07 | 0.36TFLOPS | 0.36TFLOPS | 1.87% | 1.85% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 3.69E-06 | 2874.93us | 2896.9us | 347.83op/s | 345.2op/s | 731652.08us | 3893.26us | -| nativetorch | 1.90E-06 | 2952.39us | 2973.7us | 338.71op/s | 336.28op/s | 3196.17us | 2976.84us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 264.57W | 270.0W | 4.77W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 275.86W | 279.0W | 3.24W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.742% | 1.399% | 47.0°C | 11.326% | -| flaggems监控结果 | 0.732% | 1.408% | 46.38°C | 11.93% | diff --git a/operation/benchmarks/mean/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/mean/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/mean/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/mean/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/min/cambricon/MLU/case_config.yaml b/operation/benchmarks/min/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/min/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/min/cambricon/MLU/env.sh b/operation/benchmarks/min/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/min/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/min/case_config.yaml b/operation/benchmarks/min/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/min/case_config.yaml +++ b/operation/benchmarks/min/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/min/iluvatar/BI150/README.md b/operation/benchmarks/min/iluvatar/BI150/README.md new file mode 100644 index 000000000..4142175f4 --- /dev/null +++ b/operation/benchmarks/min/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.16TFLOPS | 0.16TFLOPS | 0.66% | 0.65% | +| nativetorch | 0.00E+00 | 0.09TFLOPS | 0.09TFLOPS | 0.37% | 0.37% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- 
| ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 3328.02us | 3355.72us | 300.48op/s | 298.0op/s | 780935.6us | 3678.99us | +| nativetorch | 0.00E+00 | 5932.67us | 5957.81us | 168.56op/s | 167.85op/s | 6140.08us | 6205.71us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2052.0W | 2071.0W | 26.87W | / | 142.63W | 143.0W | 2.84W | 350W | +| flaggems监控结果 | 2061.5W | 2109.0W | 47.5W | / | 169.79W | 171.0W | 6.33W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 51.305% | 2.389% | 43.93°C | 6.989% | +| flaggems监控结果 | 54.13% | 2.396% | 48.66°C | 6.989% | diff --git a/operation/benchmarks/min/iluvatar/BI150/case_config.yaml b/operation/benchmarks/min/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/min/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/min/iluvatar/BI150/env.sh b/operation/benchmarks/min/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/min/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/min/iluvatar/BI150/requirements.txt b/operation/benchmarks/min/iluvatar/BI150/requirements.txt new file mode 100644 index 
000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/min/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/min/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/min/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..1138a649a --- /dev/null +++ b/operation/benchmarks/min/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 1] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/min/kunlunxin/R300p/env.sh b/operation/benchmarks/min/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..176026da9 --- /dev/null +++ b/operation/benchmarks/min/kunlunxin/R300p/env.sh @@ -0,0 +1,6 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +export Triton_big_instcombine=1000 + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/min/main.py b/operation/benchmarks/min/main.py index 2bf1b54a4..f1399f63d 100644 --- a/operation/benchmarks/min/main.py +++ b/operation/benchmarks/min/main.py @@ -4,12 +4,13 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess +import math sys.path.append("..") from drivers.utils import * @@ -23,6 +24,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,43 +54,39 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": 
torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements + # default shape: (M, 1024, 1024) + shape = (m, 1024, 1024) - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.min(a_fp64) - - a = a.to(0) - r_device = torch.min(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.min, (a, ), host_device_sync, config, case_config) - op2flops = lambda x: x * m * 1024 * 1024 + op2flops = lambda x: x * math.prod(shape) perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "min", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +94,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -101,4 +107,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + 
main(config, case_config) diff --git a/operation/benchmarks/min/metax/C550_64/case_config.yaml b/operation/benchmarks/min/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/min/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/min/metax/C550_64/env.sh b/operation/benchmarks/min/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/min/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/min/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/min/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..4fec12650 --- /dev/null +++ b/operation/benchmarks/min/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.67TFLOPS | 0.66TFLOPS | 0.21% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.7us | 1465.34us | 692.66op/s | 682.43op/s | 2326719.8us | 1511.22us | +| nativetorch | 1608.85us | 1629.18us | 621.56op/s | 613.8op/s | 21419.48us | 1700.41us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 308.19W | 316.0W | 5.97W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 279.07W | 284.0W | 3.19W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.244% | 2.32% | 51.36°C | 7.295% | +| flaggems监控结果 | 0.669% | 2.324% | 46.29°C | 7.902% | diff --git a/operation/benchmarks/min/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/min/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..1bc2760b6 --- /dev/null +++ b/operation/benchmarks/min/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.68TFLOPS | 0.67TFLOPS | 0.22% | 0.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.59us | 1465.34us | 692.72op/s | 682.43op/s | 1086461.1us | 1512.87us | +| nativetorch | 1590.4us | 1611.78us | 628.77op/s | 620.43op/s | 946829.93us | 1661.11us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 316.38W | 323.0W | 6.07W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 278.93W | 283.0W | 3.86W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.642% | 2.324% | 49.96°C | 7.295% | +| flaggems监控结果 | 0.698% | 2.327% | 45.29°C | 7.107% | diff --git a/operation/benchmarks/min/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/min/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..ee17c2105 --- /dev/null +++ b/operation/benchmarks/min/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | +| nativetorch | True | 0.36TFLOPS | 0.36TFLOPS | 1.86% | 1.85% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2872.56us | 2894.85us | 348.12op/s | 345.44op/s | 1132797.06us | 2940.16us | +| nativetorch | 2962.03us | 2983.94us | 337.61op/s | 335.13op/s | 21439.96us | 2992.93us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 276.53W | 280.0W | 2.54W | 400W | +| flaggems监控结果 | 1560.0W | 1716.0W | 156.0W | / | 275.57W | 278.0W | 3.57W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.696% | 2.297% | 46.82°C | 12.347% | +| flaggems监控结果 | 0.755% | 2.302% | 47.34°C | 12.954% | diff --git a/operation/benchmarks/min/nvidia/A100_40_SXM/README.md b/operation/benchmarks/min/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 70993f225..000000000 --- a/operation/benchmarks/min/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 
内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | -| nativetorch | 0.00E+00 | 0.36TFLOPS | 0.36TFLOPS | 1.86% | 1.85% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 2873.03us | 2895.87us | 348.06op/s | 345.32op/s | 1925186.13us | 2939.79us | -| nativetorch | 0.00E+00 | 2961.08us | 2981.89us | 337.71op/s | 335.36op/s | 3209.14us | 2987.49us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1716.0W | 1716.0W | 0.0W | / | 276.03W | 278.0W | 2.76W | 1716.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 276.66W | 281.0W | 2.56W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.784% | 1.401% | 46.81°C | 11.326% | -| flaggems监控结果 | 0.761% | 1.407% | 46.42°C | 11.138% | diff --git a/operation/benchmarks/min/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/min/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/min/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/min/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/mm/cambricon/MLU/case_config.yaml b/operation/benchmarks/mm/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/mm/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/mm/cambricon/MLU/env.sh b/operation/benchmarks/mm/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/mm/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/mm/case_config.yaml b/operation/benchmarks/mm/case_config.yaml index ea88f2149..dca460c3d 100644 --- a/operation/benchmarks/mm/case_config.yaml +++ b/operation/benchmarks/mm/case_config.yaml @@ -1,7 +1,6 @@ M: 8192 N: 8192 K: 8192 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/mm/main.py b/operation/benchmarks/mm/main.py index de60b1a4e..5f22bae66 100644 --- a/operation/benchmarks/mm/main.py +++ b/operation/benchmarks/mm/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -24,6 +24,16 @@ def parse_args(): required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + parser.add_argument("--dataformat", type=str, required=True, @@ -45,34 +55,22 @@ def parse_args(): def main(config, case_config): - print("Test Correctness with 16-times smaller operation" - ) # correctness is implemented casebycase + correctness = do_correctness(config.case_name) + correctness = correctness == 0 m = case_config.M n = case_config.N k = case_config.K + op2flops = 
lambda x: x * 2 * m * n * k - dtype = {"FP16": torch.float16} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn((m // 16, n // 16), dtype=dtype[config.dataformat]) - b = torch.randn((n // 16, k // 16), dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.mm(a, b) - - a = a.to(0) - b = b.to(0) - - r_device = torch.mm(a, b).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - mmape.append(mape) - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } a = torch.randn((m, n), dtype=dtype[config.dataformat]).to(0) b = torch.randn((n, k), dtype=dtype[config.dataformat]).to(0) @@ -80,11 +78,9 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.mm, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: x * 2 * m * n * k - perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "matrix multiply(mm)", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -92,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/mm/nvidia/A100_40_SXM/README.md b/operation/benchmarks/mm/metax/C550_64/README.md similarity index 55% rename from operation/benchmarks/mm/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/mm/metax/C550_64/README.md index 6c6177389..ed6257912 100644 --- 
a/operation/benchmarks/mm/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/mm/metax/C550_64/README.md @@ -1,11 +1,11 @@ # 参评AI芯片信息 -* 厂商:Nvidia +* 厂商:Metax -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:350W # 所用服务器配置 @@ -13,12 +13,12 @@ * 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB +* 服务器型号:Nettrix X640 G40 +* 操作系统版本:Ubuntu 20.04.1 LTS +* 操作系统内核:linux5.4.0-42-generic +* CPU:Intel(R) Xeon(R) Gold 6348-112core +* docker版本:27.0.3 +* 内存:2.0TiB * 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 # 算子库版本 @@ -31,26 +31,26 @@ https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a3 | 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | | ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 5.066E-6 | 255.67TFLOPS | 258.48TFLOPS | 82.85% | 81.95% | -| nativetorch | 5.066E-6 | 256.01TFLOPS | 260.17TFLOPS | 83.39% | 82.05% | +| flaggems | 3.112E-6 | | | 45.4% | 45.01% | +| nativetorch | 3.112E-6 | | | 64.17% | 63.97% | ## 其他评测结果 | 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | | ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 3.576E-6 | 4300.53us | 4253.70us | 232.53 op/s | 235.09 op/s | 2194470.42 us | 4267.14 us | -| nativetorch | 3.576E-6 | 4294.78us | 4226.05us | 232.84 op/s | 236.63 op/s | 10435.93 us | 4209.60 us | +| flaggems | 2.859E-6 | | | | | | | +| nativetorch | 2.859E-6 | | | | | | | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | | ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1833.0W | 1872.0W | 39.0W | / | 399.56W | 412.0W | 6.05W | 400W | -| flaggems监控结果 
| 1833.0W | 1872.0W | 39.0W | / | 398.25W | 410.0W | 3.3W | 400W | +| nativetorch监控结果 | 214.8W | 297.36W | 20.29W | / | 70.2W | 72.0W | 0.0W | 350W | +| flaggems监控结果 | 199.56W | 354.75W | 54.51W | / | 72.0W | 72.0W | 0.0W | 350W | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | | ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 1.021% | 1.228% | 63.70°C | 0.262% | -| flaggems监控结果 | 1.058% | 1.192% | 64.05°C | 0.338% | +| nativetorch监控结果 | 5.583% | 1.973% | 44.0°C | 18.599% | +| flaggems监控结果 | 3.038% | 2.018% | 44.0 °C | 18.599% | diff --git a/operation/benchmarks/mm/metax/C550_64/case_config.yaml b/operation/benchmarks/mm/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/mm/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/mm/metax/C550_64/env.sh b/operation/benchmarks/mm/metax/C550_64/env.sh new file mode 100644 index 000000000..79ff0fea1 --- /dev/null +++ b/operation/benchmarks/mm/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_ENABLE_COMPILER_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/mm/metax/C550_64/requirements.txt b/operation/benchmarks/mm/metax/C550_64/requirements.txt new file mode 100644 index 000000000..7248303e5 --- /dev/null +++ b/operation/benchmarks/mm/metax/C550_64/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git a/operation/benchmarks/mm/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/mm/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..c961f3d74 --- /dev/null +++ b/operation/benchmarks/mm/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* 
docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 263.37TFLOPS | 264.66TFLOPS | 84.41% | 84.83% | +| nativetorch | True | 263.23TFLOPS | 265.19TFLOPS | 84.37% | 85.0% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4174.73us | 4154.37us | 239.54op/s | 240.71op/s | 17801636.01us | 4133.21us | +| nativetorch | 4177.02us | 4146.18us | 239.41op/s | 241.19op/s | 150712.5us | 4240.62us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1490.67W | 1794.0W | 115.95W | / | 392.9W | 418.0W | 17.96W | 400W | +| flaggems监控结果 | 1473.33W | 1872.0W | 132.29W | / | 399.0W | 412.0W | 5.34W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.799% | 2.307% | 64.38°C | 7.181% | +| flaggems监控结果 | 0.978% | 2.381% | 64.69°C | 7.181% | diff --git a/operation/benchmarks/mm/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/mm/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..50ff71e4d --- /dev/null +++ b/operation/benchmarks/mm/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 
+* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 255.52TFLOPS | 258.3TFLOPS | 81.9% | 82.79% | +| nativetorch | True | 255.94TFLOPS | 257.62TFLOPS | 82.03% | 82.57% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4302.98us | 4256.77us | 232.4op/s | 234.92op/s | 21432134.56us | 4219.83us | +| nativetorch | 4296.02us | 4268.03us | 232.77op/s | 234.3op/s | 184282.1us | 4428.74us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1495.0W | 1872.0W | 138.19W | / | 396.67W | 407.0W | 4.2W | 400W | +| flaggems监控结果 | 1495.0W | 1872.0W | 130.65W | / | 399.71W | 417.0W | 6.78W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.84% | 2.648% | 64.56°C | 7.181% | +| flaggems监控结果 | 0.912% | 2.857% | 64.11°C | 7.181% | diff --git a/operation/benchmarks/mm/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/mm/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..4af7d11ff --- /dev/null +++ b/operation/benchmarks/mm/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS 
+* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 18.69TFLOPS | 18.69TFLOPS | 95.87% | 95.86% | +| nativetorch | True | 127.29TFLOPS | 127.09TFLOPS | 652.78% | 651.72% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 58817.26us | 58819.58us | 17.0op/s | 17.0op/s | 98562329.26us | 58999.11us | +| nativetorch | 8637.72us | 8651.78us | 115.77op/s | 115.58op/s | 112106.58us | 8488.85us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1521.0W | 1872.0W | 170.0W | / | 398.74W | 416.0W | 4.85W | 400W | +| flaggems监控结果 | 1647.51W | 1794.0W | 172.44W | / | 346.39W | 349.0W | 3.75W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.82% | 2.785% | 64.89°C | 7.181% | +| flaggems监控结果 | 1.11% | 2.609% | 57.24°C | 7.181% | diff --git a/operation/benchmarks/mm/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/mm/nvidia/A100_40_SXM/case_config.yaml index f3489fba2..bc4b04b42 100644 --- a/operation/benchmarks/mm/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/mm/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 312 diff --git a/operation/benchmarks/mul/cambricon/MLU/case_config.yaml 
b/operation/benchmarks/mul/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/mul/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/mul/cambricon/MLU/env.sh b/operation/benchmarks/mul/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/mul/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/mul/case_config.yaml b/operation/benchmarks/mul/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/mul/case_config.yaml +++ b/operation/benchmarks/mul/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/mul/iluvatar/BI150/README.md b/operation/benchmarks/mul/iluvatar/BI150/README.md new file mode 100644 index 000000000..61190d561 --- /dev/null +++ b/operation/benchmarks/mul/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.29% | +| nativetorch | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | 
------------ | +| flaggems | 0.00E+00 | 7399.96us | 7416.38us | 135.14op/s | 134.84op/s | 229898.76us | 7908.33us | +| nativetorch | 0.00E+00 | 7381.1us | 7394.29us | 135.48op/s | 135.24op/s | 7697.06us | 7654.44us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2071.0W | 2090.0W | 32.91W | / | 164.0W | 165.0W | 5.59W | 350W | +| flaggems监控结果 | 2071.0W | 2090.0W | 32.91W | / | 170.96W | 171.0W | 0.2W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 38.981% | 2.388% | 46.85°C | 19.489% | +| flaggems监控结果 | 49.387% | 2.391% | 47.7°C | 19.489% | diff --git a/operation/benchmarks/mul/iluvatar/BI150/case_config.yaml b/operation/benchmarks/mul/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/mul/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/mul/iluvatar/BI150/env.sh b/operation/benchmarks/mul/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/mul/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/mul/iluvatar/BI150/requirements.txt b/operation/benchmarks/mul/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ 
b/operation/benchmarks/mul/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/mul/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/mul/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/mul/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/mul/kunlunxin/R300p/env.sh b/operation/benchmarks/mul/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/mul/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/mul/main.py b/operation/benchmarks/mul/main.py index 9d5009072..b111a6dca 100644 --- a/operation/benchmarks/mul/main.py +++ b/operation/benchmarks/mul/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # 
correctness is implemented casebycase Melements = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(Melements, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.mul(a, 2) - - a = a.to(0) - r_device = torch.mul(a, 2).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(Melements * 1024 * 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * Melements * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "mul", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/mul/metax/C550_64/case_config.yaml b/operation/benchmarks/mul/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/mul/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/mul/metax/C550_64/env.sh b/operation/benchmarks/mul/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/mul/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/mul/nvidia/A100_40_SXM/BF16_README.md 
b/operation/benchmarks/mul/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..a16e112d0 --- /dev/null +++ b/operation/benchmarks/mul/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3180.82us | 3187.71us | 314.38op/s | 313.7op/s | 1120746.29us | 3266.46us | +| nativetorch | 3081.14us | 3087.36us | 324.55op/s | 323.9op/s | 19881.11us | 3105.43us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 265.39W | 270.0W | 4.45W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 259.59W | 265.0W | 4.28W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.93% | 2.307% | 47.61°C | 17.394% | +| flaggems监控结果 | 1.016% | 2.309% | 47.5°C | 17.207% | diff --git 
a/operation/benchmarks/mul/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/mul/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..950d854c7 --- /dev/null +++ b/operation/benchmarks/mul/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3178.57us | 3184.64us | 314.61op/s | 314.01op/s | 850317.95us | 3267.59us | +| nativetorch | 3079.73us | 3086.34us | 324.7op/s | 324.01op/s | 721935.23us | 3100.84us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 277.06W | 283.0W | 4.33W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 260.34W | 264.0W | 4.35W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.875% | 2.311% | 48.88°C | 17.394% | +| flaggems监控结果 | 1.709% | 2.31% | 47.29°C | 
17.207% | diff --git a/operation/benchmarks/mul/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/mul/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..a4a4a74c8 --- /dev/null +++ b/operation/benchmarks/mul/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6260.62us | 6270.98us | 159.73op/s | 159.46op/s | 856485.53us | 6372.64us | +| nativetorch | 6195.24us | 6197.25us | 161.41op/s | 161.36op/s | 23388.63us | 6281.23us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 261.31W | 264.0W | 3.24W | 400W | +| flaggems监控结果 | 1586.0W | 1638.0W | 73.54W | / | 257.32W | 260.0W | 2.83W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.811% | 2.293% | 48.57°C | 32.551% | +| flaggems监控结果 | 0.824% | 
2.288% | 48.68°C | 32.364% | diff --git a/operation/benchmarks/mul/nvidia/A100_40_SXM/README.md b/operation/benchmarks/mul/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 7cc9a4677..000000000 --- a/operation/benchmarks/mul/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6191.22us | 6194.18us | 161.52op/s | 161.44op/s | 253536.13us | 6251.42us | -| nativetorch | 0.00E+00 | 6196.2us | 6196.22us | 161.39op/s | 161.39op/s | 11111.41us | 6269.32us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 254.56W | 257.0W | 2.96W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 294.6W | 300.0W | 3.22W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- 
| -| nativetorch监控结果 | 0.792% | 1.3% | 48.05°C | 31.535% | -| flaggems监控结果 | 0.824% | 1.301% | 51.02°C | 31.347% | diff --git a/operation/benchmarks/mul/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/mul/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/mul/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/mul/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/mv/cambricon/MLU/case_config.yaml b/operation/benchmarks/mv/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/mv/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/mv/cambricon/MLU/env.sh b/operation/benchmarks/mv/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/mv/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/mv/case_config.yaml b/operation/benchmarks/mv/case_config.yaml new file mode 100644 index 000000000..34df1050f --- /dev/null +++ b/operation/benchmarks/mv/case_config.yaml @@ -0,0 +1,6 @@ +M: 1024 +N: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/mv/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/mv/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/mv/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/mv/kunlunxin/R300p/env.sh b/operation/benchmarks/mv/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/mv/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate 
python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/mv/main.py b/operation/benchmarks/mv/main.py new file mode 100644 index 000000000..591842e71 --- /dev/null +++ b/operation/benchmarks/mv/main.py @@ -0,0 +1,105 @@ + # Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + m = case_config.M + n = case_config.N + + + + a = torch.randn(m, n, dtype=dtype[config.dataformat]).to(0) + b = torch.randn(n, dtype=dtype[config.dataformat]).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.mv, (a, b, ), host_device_sync, config, 
case_config) + + op2flops = lambda x: x * m * n + x * m *(n-1) + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) \ No newline at end of file diff --git a/operation/benchmarks/mv/metax/C550_64/case_config.yaml b/operation/benchmarks/mv/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bb110afa5 --- /dev/null +++ b/operation/benchmarks/mv/metax/C550_64/case_config.yaml @@ -0,0 +1,3 @@ +M: 26624 +N: 26624 +ITERS: 50000 diff --git a/operation/benchmarks/mv/metax/C550_64/env.sh b/operation/benchmarks/mv/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/mv/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/mv/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/mv/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..e204f5181 --- /dev/null +++ b/operation/benchmarks/mv/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.04TFLOPS | 0.17TFLOPS | 0.01% | 0.05% | +| nativetorch | True | 0.15TFLOPS | 0.17TFLOPS | 0.05% | 0.05% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 48.26us | 12.29us | 20721.7op/s | 81380.21op/s | 19575635.47us | 68.19us | +| nativetorch | 14.39us | 12.29us | 69469.75op/s | 81380.21op/s | 162237.72us | 33.13us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1508.0W | 1560.0W | 73.54W | / | 157.94W | 177.0W | 20.48W | 400W | +| flaggems监控结果 | 1521.0W | 1560.0W | 39.0W | / | 167.0W | 177.0W | 13.13W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.84% | 2.484% | 37.56°C | 3.461% | +| flaggems监控结果 | 1.35% | 2.484% | 37.74°C | 3.461% | diff --git a/operation/benchmarks/mv/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/mv/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..0e12d79e8 --- /dev/null +++ b/operation/benchmarks/mv/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.04TFLOPS | 0.19TFLOPS | 0.01% | 0.06% | +| nativetorch | True | 0.14TFLOPS | 0.17TFLOPS | 0.04% | 0.05% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 52.9us | 11.26us | 18901.85op/s | 88778.41op/s | 16327512.33us | 73.89us | +| nativetorch | 15.44us | 12.29us | 64751.84op/s | 81380.21op/s | 148089.35us | 37.16us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1482.0W | 1560.0W | 63.69W | / | 165.24W | 178.0W | 13.78W | 400W | +| flaggems监控结果 | 1482.0W | 1560.0W | 63.69W | / | 164.88W | 177.0W | 15.11W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.982% | 1.613% | 37.37°C | 2.519% | +| flaggems监控结果 | 1.325% | 1.548% | 37.92°C | 87.752% | diff --git a/operation/benchmarks/mv/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/mv/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..25c3c2a51 --- /dev/null +++ b/operation/benchmarks/mv/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 
+ +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.04TFLOPS | 0.16TFLOPS | 0.22% | 0.81% | +| nativetorch | True | 0.16TFLOPS | 0.16TFLOPS | 0.83% | 0.81% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 49.05us | 13.31us | 20387.12op/s | 75120.19op/s | 19870586.09us | 76.59us | +| nativetorch | 13.02us | 13.31us | 76807.16op/s | 75120.19op/s | 94715.66us | 31.02us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1482.0W | 1560.0W | 63.69W | / | 164.0W | 178.0W | 14.68W | 400W | +| flaggems监控结果 | 1482.0W | 1560.0W | 55.15W | / | 165.67W | 178.0W | 16.16W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.664% | 2.487% | 37.6°C | 3.461% | +| flaggems监控结果 | 0.687% | 2.49% | 37.1°C | 3.461% | diff --git a/operation/benchmarks/mv/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/mv/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/mv/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/mv/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/mv/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/mv/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA 
PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/mv/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/mv/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/mv/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/native_dropout/cambricon/MLU/case_config.yaml b/operation/benchmarks/native_dropout/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/native_dropout/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/native_dropout/cambricon/MLU/env.sh b/operation/benchmarks/native_dropout/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/native_dropout/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/native_dropout/case_config.yaml b/operation/benchmarks/native_dropout/case_config.yaml new file mode 100644 index 000000000..acc0f44fb --- /dev/null +++ b/operation/benchmarks/native_dropout/case_config.yaml @@ -0,0 +1,5 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/native_dropout/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/native_dropout/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..bfd12215d --- /dev/null +++ b/operation/benchmarks/native_dropout/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Melements: 1 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/native_dropout/kunlunxin/R300p/env.sh b/operation/benchmarks/native_dropout/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/native_dropout/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh 
&& conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/native_dropout/main.py b/operation/benchmarks/native_dropout/main.py new file mode 100644 index 000000000..04f1b2138 --- /dev/null +++ b/operation/benchmarks/native_dropout/main.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + m = case_config.Melements + a = torch.randn(m * 1024 * 1024, dtype=dtype[config.dataformat], requires_grad=True).to(0) + f = torch.nn.Dropout(p=0.2) + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, ), 
host_device_sync, config, case_config, bp=True) + + op2flops = lambda x: x * m * 1024 * 1024 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/native_dropout/metax/C550_64/case_config.yaml b/operation/benchmarks/native_dropout/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/native_dropout/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/native_dropout/metax/C550_64/env.sh b/operation/benchmarks/native_dropout/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/native_dropout/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..51420378c --- /dev/null +++ b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core 
+* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.32TFLOPS | 0.32TFLOPS | 0.1% | 0.1% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9627.75us | 9748.48us | 103.87op/s | 102.58op/s | 1408977.47us | 3302.64us | +| nativetorch | 10215.84us | 10185.73us | 97.89op/s | 98.18op/s | 1137786.15us | 4681.54us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1653.6W | 1716.0W | 124.8W | / | 289.69W | 297.0W | 4.6W | 400W | +| flaggems监控结果 | 1700.4W | 1794.0W | 151.25W | / | 347.53W | 356.0W | 6.39W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.277% | 1.46% | 51.55°C | 26.483% | +| flaggems监控结果 | 0.643% | 1.465% | 54.23°C | 21.43% | diff --git a/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..3b95c2fbf --- /dev/null +++ b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 
操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.32TFLOPS | 0.32TFLOPS | 0.1% | 0.1% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9621.87us | 9734.14us | 103.93op/s | 102.73op/s | 530372.57us | 3298.25us | +| nativetorch | 10168.93us | 10148.86us | 98.34op/s | 98.53op/s | 15433.59us | 4557.64us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1669.2W | 1716.0W | 93.6W | / | 294.44W | 305.0W | 11.44W | 400W | +| flaggems监控结果 | 1731.6W | 1794.0W | 124.8W | / | 348.92W | 359.0W | 15.24W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.448% | 1.451% | 51.97°C | 26.483% | +| flaggems监控结果 | 0.756% | 1.454% | 53.64°C | 26.483% | diff --git a/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..3cbe9dd02 --- /dev/null +++ b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 
+* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.88% | +| nativetorch | True | 0.21TFLOPS | 0.21TFLOPS | 1.06% | 1.06% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 18613.17us | 18829.31us | 53.73op/s | 53.11op/s | 548479.29us | 6866.79us | +| nativetorch | 15512.51us | 15555.58us | 64.46op/s | 64.29op/s | 20707.57us | 7843.75us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1626.86W | 1716.0W | 64.97W | / | 277.03W | 287.0W | 5.08W | 400W | +| flaggems监控结果 | 1681.33W | 1716.0W | 74.55W | / | 285.66W | 298.0W | 7.12W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.919% | 1.64% | 52.12°C | 46.692% | +| flaggems监控结果 | 1.94% | 1.664% | 51.01°C | 41.64% | diff --git a/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git 
a/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/native_dropout/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/native_group_norm/cambricon/MLU/case_config.yaml b/operation/benchmarks/native_group_norm/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/native_group_norm/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/native_group_norm/cambricon/MLU/env.sh b/operation/benchmarks/native_group_norm/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/native_group_norm/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/native_group_norm/case_config.yaml b/operation/benchmarks/native_group_norm/case_config.yaml new file mode 100644 index 000000000..406c921c8 --- /dev/null +++ b/operation/benchmarks/native_group_norm/case_config.yaml @@ -0,0 +1,7 @@ +bs: 20 +channel: 6 +hiddensize: 32768 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/native_group_norm/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/native_group_norm/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..0c2890f86 --- /dev/null +++ b/operation/benchmarks/native_group_norm/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,5 @@ +bs: 1 +channel: 3 +hiddensize: 4 +ITERS: 50 
+SPECTFLOPS: 9999 diff --git a/operation/benchmarks/native_group_norm/kunlunxin/R300p/env.sh b/operation/benchmarks/native_group_norm/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/native_group_norm/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/native_group_norm/main.py b/operation/benchmarks/native_group_norm/main.py new file mode 100644 index 000000000..183e686db --- /dev/null +++ b/operation/benchmarks/native_group_norm/main.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + 
"FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + bs = case_config.bs + channel = case_config.channel + hiddensize = case_config.hiddensize + a = torch.randn(bs, channel, hiddensize, dtype=dtype[config.dataformat], requires_grad=True).to(0) + f = torch.nn.GroupNorm(channel // 2, channel, dtype=dtype[config.dataformat]).to(0) + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, ), host_device_sync, config, case_config, bp=True) + + op2flops = lambda x: x * bs * channel * hiddensize * 9 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/native_group_norm/metax/C550_64/case_config.yaml b/operation/benchmarks/native_group_norm/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..2c13a2d53 --- /dev/null +++ b/operation/benchmarks/native_group_norm/metax/C550_64/case_config.yaml @@ -0,0 +1,6 @@ +bs: 16 +channel: 8192 +hiddensize: 1024 +ITERS: 50000 +KERNELWARMUP: 1000 +KERNELITERS: 10000 \ No newline at end of file diff --git a/operation/benchmarks/native_group_norm/metax/C550_64/env.sh b/operation/benchmarks/native_group_norm/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ 
b/operation/benchmarks/native_group_norm/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..33e479dec --- /dev/null +++ b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.02TFLOPS | 0.02TFLOPS | 0.01% | 0.01% | +| nativetorch | True | 0.46TFLOPS | 0.6TFLOPS | 0.15% | 0.19% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4328.45us | 4333.57us | 231.03op/s | 230.76op/s | 3293166.19us | 239.14us | +| nativetorch | 228.43us | 177.15us | 4377.71op/s | 5644.87op/s | 13573.29us | 140.1us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 146.33W | 168.0W | 15.37W | 400W | +| flaggems监控结果 | 1430.0W | 1482.0W | 36.77W | / | 98.93W | 101.0W | 2.61W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | 
--------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.444% | 1.617% | 35.24°C | 2.622% | +| flaggems监控结果 | 1.56% | 2.062% | 34.14°C | 43.204% | diff --git a/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..59d5d42d9 --- /dev/null +++ b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.04TFLOPS | 0.04TFLOPS | 0.01% | 0.01% | +| nativetorch | True | 0.47TFLOPS | 0.6TFLOPS | 0.15% | 0.19% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2866.46us | 2872.32us | 348.86op/s | 348.15op/s | 3057429.97us | 236.14us | +| nativetorch | 226.19us | 176.13us | 4421.05op/s | 5677.69op/s | 13866.92us | 128.79us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1404.0W | 1404.0W | 0.0W | / | 148.0W | 169.0W | 14.85W | 400W | +| flaggems监控结果 | 1419.6W | 1482.0W | 31.2W | / | 102.28W | 104.0W | 0.87W | 400W | + +## 其他重要监控结果 + +| 监控项 | 
系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.44% | 1.617% | 35.03°C | 2.622% | +| flaggems监控结果 | 0.895% | 2.027% | 34.1°C | 35.911% | diff --git a/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..4908a03a2 --- /dev/null +++ b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.05TFLOPS | 0.05TFLOPS | 0.24% | 0.24% | +| nativetorch | True | 0.47TFLOPS | 0.47TFLOPS | 2.4% | 2.43% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2257.16us | 2268.16us | 443.04op/s | 440.89op/s | 3394693.6us | 338.95us | +| nativetorch | 226.58us | 224.26us | 4413.52op/s | 4459.19op/s | 14645.49us | 165.67us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1482.0W | 1482.0W | 0.0W | / | 166.0W | 174.0W | 9.93W | 400W | +| flaggems监控结果 | 1410.0W | 1482.0W | 20.78W | / | 110.27W | 
111.0W | 0.81W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.967% | 1.688% | 34.38°C | 2.622% | +| flaggems监控结果 | 1.602% | 2.225% | 33.95°C | 35.132% | diff --git a/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/native_group_norm/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/ne/cambricon/MLU/case_config.yaml b/operation/benchmarks/ne/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/ne/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/ne/cambricon/MLU/env.sh b/operation/benchmarks/ne/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/ne/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/ne/case_config.yaml b/operation/benchmarks/ne/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- 
a/operation/benchmarks/ne/case_config.yaml +++ b/operation/benchmarks/ne/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/ne/iluvatar/BI150/README.md b/operation/benchmarks/ne/iluvatar/BI150/README.md new file mode 100644 index 000000000..ae55c39e4 --- /dev/null +++ b/operation/benchmarks/ne/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.12TFLOPS | 0.12TFLOPS | 0.47% | 0.47% | +| nativetorch | 0.00E+00 | 0.14TFLOPS | 0.14TFLOPS | 0.56% | 0.56% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 9245.01us | 9269.66us | 108.17op/s | 107.88op/s | 240524.51us | 9739.06us | +| nativetorch | 0.00E+00 | 7840.97us | 7861.39us | 127.54op/s | 127.2op/s | 8121.84us | 8122.61us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2109.0W | 2128.0W | 32.91W | / | 167.93W | 168.0W | 0.25W | 350W | +| flaggems监控结果 | 2082.4W | 2109.0W | 33.13W | / | 163.91W | 164.0W | 0.54W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 
系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 40.643% | 2.577% | 49.77°C | 16.364% | +| flaggems监控结果 | 40.401% | 2.584% | 48.82°C | 17.926% | \ No newline at end of file diff --git a/operation/benchmarks/ne/iluvatar/BI150/case_config.yaml b/operation/benchmarks/ne/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/ne/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/ne/iluvatar/BI150/env.sh b/operation/benchmarks/ne/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/ne/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/ne/iluvatar/BI150/requirements.txt b/operation/benchmarks/ne/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/ne/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/ne/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/ne/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/ne/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/ne/kunlunxin/R300p/env.sh b/operation/benchmarks/ne/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/ne/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH 
start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/ne/main.py b/operation/benchmarks/ne/main.py index 83d07f469..fe6f0134a 100644 --- a/operation/benchmarks/ne/main.py +++ b/operation/benchmarks/ne/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,36 +53,22 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - b = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.ne(a_fp64, b_fp64) - - a = a.to(0) - b = b.to(0) - r_device = torch.ne(a, b).cpu() - mape = ((r_device != r_fp64).float().sum()/r_fp64.numel()).item() - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, 
dtype=dtype[config.dataformat]).to(0) b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -82,11 +76,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.ne, (a, b), host_device_sync, config, case_config) - op2flops = lambda x: x * 2 * m * 1024 * 1024 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "ne", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -94,6 +88,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/ne/metax/C550_64/case_config.yaml b/operation/benchmarks/ne/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/ne/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/ne/metax/C550_64/env.sh b/operation/benchmarks/ne/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/ne/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/ne/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/ne/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..45e778f4d --- /dev/null +++ b/operation/benchmarks/ne/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 
+* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3971.65us | 3990.53us | 251.78op/s | 250.59op/s | 2304675.66us | 4078.79us | +| nativetorch | 3814.52us | 3828.74us | 262.16op/s | 261.18op/s | 23685.74us | 3835.35us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 261.84W | 266.0W | 4.46W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 300.6W | 305.0W | 5.36W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.054% | 2.523% | 48.22°C | 17.394% | +| flaggems监控结果 | 0.768% | 2.524% | 51.62°C | 17.207% | diff --git a/operation/benchmarks/ne/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/ne/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..6e5e13963 --- /dev/null +++ b/operation/benchmarks/ne/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 
操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.27TFLOPS | 0.27TFLOPS | 0.09% | 0.09% | +| nativetorch | True | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3962.32us | 3981.31us | 252.38op/s | 251.17op/s | 2143105.17us | 4068.5us | +| nativetorch | 3809.05us | 3823.62us | 262.53op/s | 261.53op/s | 22980.26us | 3834.04us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 270.0W | 276.0W | 4.64W | 400W | +| flaggems监控结果 | 1612.0W | 1716.0W | 97.28W | / | 298.12W | 303.0W | 5.56W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.21% | 2.524% | 48.89°C | 17.394% | +| flaggems监控结果 | 0.735% | 2.525% | 50.81°C | 17.207% | diff --git a/operation/benchmarks/ne/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/ne/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..a17d87b2d --- /dev/null +++ b/operation/benchmarks/ne/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 
20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.8% | +| nativetorch | True | 0.16TFLOPS | 0.16TFLOPS | 0.81% | 0.81% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6822.12us | 6840.32us | 146.58op/s | 146.19op/s | 1710672.5us | 6933.42us | +| nativetorch | 6778.83us | 6796.29us | 147.52op/s | 147.14op/s | 26330.66us | 6818.97us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1618.5W | 1716.0W | 85.0W | / | 260.96W | 266.0W | 3.26W | 400W | +| flaggems监控结果 | 1657.5W | 1716.0W | 101.32W | / | 286.36W | 290.0W | 3.08W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.764% | 2.493% | 48.42°C | 27.499% | +| flaggems监控结果 | 1.18% | 2.494% | 49.3°C | 30.025% | diff --git a/operation/benchmarks/ne/nvidia/A100_40_SXM/README.md b/operation/benchmarks/ne/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 5d0d9d5f6..000000000 --- a/operation/benchmarks/ne/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 
操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.63% | 1.62% | -| nativetorch | 0.00E+00 | 0.32TFLOPS | 0.32TFLOPS | 1.62% | 1.62% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6770.05us | 6787.07us | 147.71op/s | 147.34op/s | 262163.66us | 6847.53us | -| nativetorch | 0.00E+00 | 6778.49us | 6796.29us | 147.53op/s | 147.14op/s | 8599.52us | 6801.63us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 261.18W | 264.0W | 3.0W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 297.74W | 301.0W | 3.7W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.683% | 1.402% | 48.57°C | 26.483% | -| flaggems监控结果 | 0.736% | 1.405% | 50.4°C | 26.295% | diff --git a/operation/benchmarks/ne/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/ne/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/ne/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/ne/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 
50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git a/operation/benchmarks/neg/cambricon/MLU/case_config.yaml b/operation/benchmarks/neg/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/neg/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/neg/cambricon/MLU/env.sh b/operation/benchmarks/neg/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/neg/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/neg/case_config.yaml b/operation/benchmarks/neg/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/neg/case_config.yaml +++ b/operation/benchmarks/neg/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/neg/iluvatar/BI150/README.md b/operation/benchmarks/neg/iluvatar/BI150/README.md new file mode 100644 index 000000000..9d77d20d3 --- /dev/null +++ b/operation/benchmarks/neg/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.29% | 0.29% | +| nativetorch | 0.00E+00 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 
预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 7420.76us | 7420.54us | 134.76op/s | 134.76op/s | 234488.25us | 7949.36us | +| nativetorch | 0.00E+00 | 7388.23us | 7397.06us | 135.35op/s | 135.19op/s | 7658.95us | 7594.52us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2061.5W | 2090.0W | 28.5W | / | 162.74W | 163.0W | 2.21W | 350W | +| flaggems监控结果 | 2071.0W | 2090.0W | 32.91W | / | 169.94W | 170.0W | 0.23W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 39.843% | 2.388% | 45.86°C | 25.739% | +| flaggems监控结果 | 39.059% | 2.391% | 46.71°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/neg/iluvatar/BI150/case_config.yaml b/operation/benchmarks/neg/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/neg/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/neg/iluvatar/BI150/env.sh b/operation/benchmarks/neg/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/neg/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/neg/iluvatar/BI150/requirements.txt 
b/operation/benchmarks/neg/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/neg/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/neg/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/neg/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/neg/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/neg/kunlunxin/R300p/env.sh b/operation/benchmarks/neg/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/neg/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/neg/main.py b/operation/benchmarks/neg/main.py index 250d9b0ba..7dc295968 100644 --- a/operation/benchmarks/neg/main.py +++ b/operation/benchmarks/neg/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, 
+ "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.neg(a_fp64) - - a = a.to(0) - r_device = torch.neg(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "neg", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/neg/metax/C550_64/case_config.yaml b/operation/benchmarks/neg/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/neg/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/neg/metax/C550_64/env.sh b/operation/benchmarks/neg/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/neg/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git 
a/operation/benchmarks/neg/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/neg/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..6ecaf32db --- /dev/null +++ b/operation/benchmarks/neg/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3201.63us | 3206.14us | 312.34op/s | 311.9op/s | 913081.68us | 3279.62us | +| nativetorch | 3079.55us | 3086.34us | 324.72op/s | 324.01op/s | 23891.4us | 3098.73us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 259.9W | 263.0W | 2.82W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 301.19W | 307.0W | 6.11W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.157% | 2.51% | 47.74°C | 22.447% | +| flaggems监控结果 | 0.728% | 2.506% | 51.28°C | 22.447% 
| diff --git a/operation/benchmarks/neg/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/neg/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..b7cc2ea66 --- /dev/null +++ b/operation/benchmarks/neg/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3206.71us | 3213.31us | 311.85op/s | 311.21op/s | 1492053.34us | 3304.21us | +| nativetorch | 3079.79us | 3086.34us | 324.7op/s | 324.01op/s | 23635.07us | 3100.39us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1521.0W | 1638.0W | 117.0W | / | 276.13W | 282.0W | 3.96W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 309.62W | 315.0W | 5.28W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.789% | 2.515% | 48.74°C | 17.394% | +| flaggems监控结果 | 0.762% | 2.513% | 
50.77°C | 17.207% | diff --git a/operation/benchmarks/neg/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/neg/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..132c5ba8b --- /dev/null +++ b/operation/benchmarks/neg/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6256.77us | 6266.88us | 159.83op/s | 159.57op/s | 2005321.14us | 6367.07us | +| nativetorch | 6195.55us | 6198.27us | 161.41op/s | 161.34op/s | 32710.7us | 6607.89us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 257.5W | 261.0W | 2.47W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 282.68W | 286.0W | 3.16W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.767% | 2.488% | 48.67°C | 32.551% | +| flaggems监控结果 | 
0.748% | 2.492% | 49.76°C | 42.656% | diff --git a/operation/benchmarks/neg/nvidia/A100_40_SXM/README.md b/operation/benchmarks/neg/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 852ba64ea..000000000 --- a/operation/benchmarks/neg/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 0.00E+00 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6191.03us | 6194.18us | 161.52op/s | 161.44op/s | 1269753.1us | 6257.5us | -| nativetorch | 0.00E+00 | 6196.07us | 6200.32us | 161.39op/s | 161.28op/s | 767433.09us | 6213.74us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 255.76W | 259.0W | 3.17W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 294.94W | 299.0W | 2.76W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | 
-------------- | -| nativetorch监控结果 | 0.826% | 1.305% | 48.62°C | 31.535% | -| flaggems监控结果 | 0.777% | 1.318% | 50.13°C | 31.347% | diff --git a/operation/benchmarks/neg/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/neg/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/neg/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/neg/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/outer/cambricon/MLU/case_config.yaml b/operation/benchmarks/outer/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/outer/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/outer/cambricon/MLU/env.sh b/operation/benchmarks/outer/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/outer/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/outer/case_config.yaml b/operation/benchmarks/outer/case_config.yaml new file mode 100644 index 000000000..34df1050f --- /dev/null +++ b/operation/benchmarks/outer/case_config.yaml @@ -0,0 +1,6 @@ +M: 1024 +N: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/outer/main.py b/operation/benchmarks/outer/main.py new file mode 100644 index 000000000..eef7334fc --- /dev/null +++ b/operation/benchmarks/outer/main.py @@ -0,0 +1,105 @@ + # Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + + m = case_config.M + n = case_config.N + + + + a = torch.randn(m * 10, dtype=dtype[config.dataformat]).to(0) + b = torch.randn(n * 10, dtype=dtype[config.dataformat]).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.outer, (a, b, ), host_device_sync, config, case_config) + + op2flops = lambda x: x * m * 10 * n * 10 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + 
with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) \ No newline at end of file diff --git a/operation/benchmarks/outer/metax/C550_64/case_config.yaml b/operation/benchmarks/outer/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/outer/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/outer/metax/C550_64/env.sh b/operation/benchmarks/outer/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/outer/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/outer/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/outer/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..6c1b57e4e --- /dev/null +++ b/operation/benchmarks/outer/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.4TFLOPS | 0.4TFLOPS | 0.13% | 0.13% | +| nativetorch | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 260.99us | 265.22us | 3831.51op/s | 3770.51op/s | 836416.01us | 376.07us | +| nativetorch | 313.07us | 316.42us | 3194.15op/s | 3160.4op/s | 8030.29us | 331.23us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1443.0W | 1482.0W | 39.0W | / | 323.67W | 326.0W | 2.05W | 400W | +| flaggems监控结果 | 1426.29W | 1482.0W | 35.24W | / | 324.33W | 327.0W | 2.49W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.86% | 1.343% | 48.5°C | 6.925% | +| flaggems监控结果 | 1.069% | 1.344% | 49.33°C | 6.925% | diff --git a/operation/benchmarks/outer/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/outer/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..8a9f2589d --- /dev/null +++ b/operation/benchmarks/outer/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.41TFLOPS | 0.4TFLOPS | 0.13% | 0.13% | +| nativetorch | True | 0.35TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 256.59us | 260.1us | 3897.25op/s | 3844.73op/s | 858145.51us | 449.84us | +| nativetorch | 300.85us | 304.13us | 3323.96op/s | 3288.09op/s | 7931.05us | 319.4us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1431.86W | 1482.0W | 37.37W | / | 328.25W | 344.0W | 21.71W | 400W | +| flaggems监控结果 | 1415.14W | 1482.0W | 27.29W | / | 336.0W | 342.0W | 4.55W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.859% | 1.342% | 49.75°C | 6.925% | +| flaggems监控结果 | 0.897% | 1.342% | 49.33°C | 6.925% | diff --git a/operation/benchmarks/outer/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/outer/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..2f671b31c --- /dev/null +++ b/operation/benchmarks/outer/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.38TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | +| nativetorch | True | 0.31TFLOPS | 0.31TFLOPS | 1.6% | 1.59% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 279.4us | 282.62us | 3579.12op/s | 3538.27op/s | 1885093.46us | 452.13us | +| nativetorch | 335.71us | 337.92us | 2978.76op/s | 2959.28op/s | 11899.75us | 366.04us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1616.33W | 1638.0W | 34.94W | / | 368.67W | 373.0W | 4.19W | 400W | +| flaggems监控结果 | 1526.57W | 1638.0W | 105.12W | / | 395.67W | 400.0W | 3.09W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 33.885% | 28.128% | 49.5°C | 6.925% | +| flaggems监控结果 | 20.204% | 14.926% | 53.33°C | 6.925% | diff --git a/operation/benchmarks/outer/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/outer/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/outer/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/outer/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/outer/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ 
b/operation/benchmarks/outer/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/outer/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/outer/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/outer/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/pow/cambricon/MLU/case_config.yaml b/operation/benchmarks/pow/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..e1206aaa4 --- /dev/null +++ b/operation/benchmarks/pow/cambricon/MLU/case_config.yaml @@ -0,0 +1,2 @@ +SPECTFLOPS: 999999 +Melements: 1 \ No newline at end of file diff --git a/operation/benchmarks/pow/cambricon/MLU/env.sh b/operation/benchmarks/pow/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/pow/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/pow/case_config.yaml b/operation/benchmarks/pow/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/pow/case_config.yaml +++ b/operation/benchmarks/pow/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/pow/iluvatar/BI150/README.md b/operation/benchmarks/pow/iluvatar/BI150/README.md new file mode 100644 index 000000000..578084010 --- /dev/null +++ b/operation/benchmarks/pow/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | 
TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 2.15E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.29% | +| nativetorch | 2.15E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 6.00E-10 | 7402.11us | 7413.81us | 135.1op/s | 134.88op/s | 352149.52us | 7944.32us | +| nativetorch | 6.00E-10 | 7389.13us | 7394.44us | 135.33op/s | 135.24op/s | 7708.49us | 7700.08us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2052.0W | 2071.0W | 32.91W | / | 163.24W | 164.0W | 4.57W | 350W | +| flaggems监控结果 | 2071.0W | 2109.0W | 44.56W | / | 180.99W | 181.0W | 0.12W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 41.737% | 2.388% | 45.81°C | 19.489% | +| flaggems监控结果 | 41.443% | 2.391% | 49.69°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/pow/iluvatar/BI150/case_config.yaml b/operation/benchmarks/pow/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/pow/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/pow/iluvatar/BI150/env.sh b/operation/benchmarks/pow/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/pow/iluvatar/BI150/env.sh @@ 
-0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/pow/iluvatar/BI150/requirements.txt b/operation/benchmarks/pow/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/pow/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/pow/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/pow/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/pow/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/pow/kunlunxin/R300p/env.sh b/operation/benchmarks/pow/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..0f16bb9ae --- /dev/null +++ b/operation/benchmarks/pow/kunlunxin/R300p/env.sh @@ -0,0 +1,6 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +export TRITON_LOCAL_VALUE_MAX=2048 + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/pow/main.py b/operation/benchmarks/pow/main.py index 56b6bdb57..9df593732 100644 --- a/operation/benchmarks/pow/main.py +++ b/operation/benchmarks/pow/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + 
parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.pow(a_fp64, 2) - - a = a.to(0) - r_device = torch.pow(a, 2).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "pow", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/pow/metax/C550_64/case_config.yaml b/operation/benchmarks/pow/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ 
b/operation/benchmarks/pow/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/pow/metax/C550_64/env.sh b/operation/benchmarks/pow/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/pow/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/pow/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/pow/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..9f002cc34 --- /dev/null +++ b/operation/benchmarks/pow/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.3TFLOPS | 0.3TFLOPS | 0.1% | 0.1% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3572.2us | 3589.12us | 279.94op/s | 278.62op/s | 1181283.56us | 3660.04us | +| nativetorch | 3081.11us | 3087.36us | 324.56op/s | 323.9op/s | 31025.66us | 3105.73us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 263.26W | 269.0W | 4.06W | 400W | +| flaggems监控结果 | 1638.0W | 1794.0W | 156.0W | / | 384.36W | 390.0W | 7.08W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.834% | 2.308% | 47.42°C | 17.394% | +| flaggems监控结果 | 0.848% | 2.307% | 57.64°C | 22.447% | diff --git a/operation/benchmarks/pow/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/pow/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..6e9f0ce02 --- /dev/null +++ b/operation/benchmarks/pow/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.31TFLOPS | 0.31TFLOPS | 0.1% | 0.1% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3470.27us | 3484.67us | 288.16op/s | 286.97op/s | 895508.7us | 3579.93us | +| nativetorch | 3079.72us | 3086.34us | 324.7op/s | 324.01op/s | 36068.52us | 3105.13us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 275.42W | 281.0W | 4.98W | 400W | +| flaggems监控结果 | 1638.0W | 1794.0W | 156.0W | / | 395.11W | 407.0W | 8.77W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.799% | 2.311% | 48.1°C | 17.394% | +| flaggems监控结果 | 0.763% | 2.309% | 58.22°C | 17.212% | diff --git a/operation/benchmarks/pow/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/pow/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..77021f7bc --- /dev/null +++ b/operation/benchmarks/pow/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + 
+# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6258.22us | 6276.1us | 159.79op/s | 159.33op/s | 950718.43us | 6357.87us | +| nativetorch | 6195.48us | 6198.27us | 161.41op/s | 161.34op/s | 32856.31us | 6328.34us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 261.62W | 264.0W | 3.64W | 400W | +| flaggems监控结果 | 1664.0W | 1794.0W | 132.57W | / | 337.46W | 343.0W | 3.67W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.391% | 2.294% | 48.91°C | 42.656% | +| flaggems监控结果 | 0.743% | 2.289% | 52.77°C | 32.369% | diff --git a/operation/benchmarks/pow/nvidia/A100_40_SXM/README.md b/operation/benchmarks/pow/nvidia/A100_40_SXM/README.md deleted file mode 100644 index d4f32ce6b..000000000 --- a/operation/benchmarks/pow/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 2.15E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 2.15E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 3.97E-10 | 6159.64us | 6160.38us | 162.35op/s | 162.33op/s | 331867.49us | 6223.51us | -| nativetorch | 3.97E-10 | 6196.01us | 6199.3us | 161.39op/s | 161.31op/s | 10710.77us | 6228.87us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 257.6W | 262.0W | 3.23W | 1638.0 | -| flaggems监控结果 | 1794.0W | 1794.0W | 0.0W | / | 348.58W | 351.0W | 3.55W | 1794.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.638% | 1.297% | 48.81°C | 31.535% | -| flaggems监控结果 | 0.661% | 1.3% | 52.93°C | 31.352% | diff --git a/operation/benchmarks/pow/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/pow/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/pow/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/pow/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/prod/cambricon/MLU/case_config.yaml 
b/operation/benchmarks/prod/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/prod/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/prod/cambricon/MLU/env.sh b/operation/benchmarks/prod/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/prod/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/prod/case_config.yaml b/operation/benchmarks/prod/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/prod/case_config.yaml +++ b/operation/benchmarks/prod/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/prod/iluvatar/BI150/README.md b/operation/benchmarks/prod/iluvatar/BI150/README.md new file mode 100644 index 000000000..aa4fd4373 --- /dev/null +++ b/operation/benchmarks/prod/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 1.00E+00 | 0.16TFLOPS | 0.16TFLOPS | 0.66% | 0.65% | +| nativetorch | 1.00E+00 | 0.09TFLOPS | 0.09TFLOPS | 0.37% | 0.36% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | 
-------------- | ------------ | +| flaggems | 0.00E+00 | 3320.89us | 3358.5us | 301.12op/s | 297.75op/s | 824814.53us | 3735.2us | +| nativetorch | 0.00E+00 | 5983.83us | 6017.23us | 167.12op/s | 166.19op/s | 6195.52us | 6238.09us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2033.0W | 2052.0W | 26.87W | / | 138.05W | 139.0W | 0.5W | 350W | +| flaggems监控结果 | 2033.0W | 2071.0W | 38.0W | / | 170.97W | 171.0W | 0.17W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 44.753% | 2.392% | 41.68°C | 7.739% | +| flaggems监控结果 | 43.84% | 2.396% | 47.41°C | 6.989% | \ No newline at end of file diff --git a/operation/benchmarks/prod/iluvatar/BI150/case_config.yaml b/operation/benchmarks/prod/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/prod/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/prod/iluvatar/BI150/env.sh b/operation/benchmarks/prod/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/prod/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/prod/iluvatar/BI150/requirements.txt b/operation/benchmarks/prod/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- 
/dev/null +++ b/operation/benchmarks/prod/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/prod/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/prod/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..398fbc7c8 --- /dev/null +++ b/operation/benchmarks/prod/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 256] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/prod/kunlunxin/R300p/env.sh b/operation/benchmarks/prod/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/prod/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/prod/main.py b/operation/benchmarks/prod/main.py index 7f50876ab..257bf71f7 100644 --- a/operation/benchmarks/prod/main.py +++ b/operation/benchmarks/prod/main.py @@ -4,12 +4,13 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess +import math sys.path.append("..") from drivers.utils import * @@ -23,6 +24,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,44 +54,39 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - 
print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements + # default shape: (M, 1024, 1024) + shape = (m, 1024, 1024) - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.prod(a_fp64) - - a = a.to(0) - r_device = torch.prod(a).cpu() - mape = torch.mean(torch.where(r_fp64 == 0, torch.tensor(0.0), torch.abs(r_device - r_fp64) / torch.abs(r_fp64))) - #mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.prod, (a,), host_device_sync, config, case_config) - op2flops = lambda x: x * m * 1024 * 1024 + op2flops = lambda x: x * math.prod(shape) perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "prod", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -90,6 +94,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -102,4 +107,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using 
nativetorch") - main(config, case_config) \ No newline at end of file + main(config, case_config) diff --git a/operation/benchmarks/prod/metax/C550_64/case_config.yaml b/operation/benchmarks/prod/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/prod/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/prod/metax/C550_64/env.sh b/operation/benchmarks/prod/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/prod/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/prod/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/prod/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..30c8fc05b --- /dev/null +++ b/operation/benchmarks/prod/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.69TFLOPS | 0.68TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.44us | 1465.34us | 692.79op/s | 682.43op/s | 1360195.72us | 1518.2us | +| nativetorch | 1556.14us | 1575.94us | 642.62op/s | 634.54op/s | 745570.32us | 1606.43us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 265.19W | 275.0W | 19.15W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 279.87W | 287.0W | 5.97W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.786% | 2.324% | 45.41°C | 7.112% | +| flaggems监控结果 | 0.732% | 2.323% | 46.92°C | 7.107% | diff --git a/operation/benchmarks/prod/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/prod/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..8d4c98124 --- /dev/null +++ b/operation/benchmarks/prod/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.69TFLOPS | 0.68TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.73us | 1465.34us | 692.65op/s | 682.43op/s | 1044867.05us | 1506.09us | +| nativetorch | 1555.39us | 1576.96us | 642.92op/s | 634.13op/s | 746274.74us | 1611.19us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 281.8W | 289.0W | 4.46W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 287.27W | 293.0W | 5.14W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.713% | 2.329% | 46.66°C | 7.112% | +| flaggems监控结果 | 0.755% | 2.327% | 45.62°C | 7.902% | diff --git a/operation/benchmarks/prod/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/prod/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..ae749d27e --- /dev/null +++ b/operation/benchmarks/prod/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | +| nativetorch | True | 0.36TFLOPS | 0.36TFLOPS | 1.87% | 1.85% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2874.46us | 2895.87us | 347.89op/s | 345.32op/s | 2674477.59us | 3024.18us | +| nativetorch | 2952.03us | 2972.67us | 338.75op/s | 336.4op/s | 1172869.36us | 3011.51us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 266.9W | 270.0W | 4.09W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 275.83W | 279.0W | 3.61W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.875% | 2.296% | 46.24°C | 12.164% | +| flaggems监控结果 | 0.745% | 2.296% | 46.56°C | 12.16% | diff --git a/operation/benchmarks/prod/nvidia/A100_40_SXM/README.md b/operation/benchmarks/prod/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 0d7bd8cdf..000000000 --- a/operation/benchmarks/prod/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 
-* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:9168f2d031ecc1b31a9f658fb66dd6735b7306b3 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 9.80E-01 | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | -| nativetorch | 9.80E-01 | 0.36TFLOPS | 0.36TFLOPS | 1.87% | 1.85% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 1.41E-01 | 2875.15us | 2896.9us | 347.81op/s | 345.2op/s | 703446.76us | 2926.41us | -| nativetorch | 1.41E-01 | 2952.4us | 2973.7us | 338.71op/s | 336.28op/s | 3197.03us | 2979.27us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 267.55W | 270.0W | 3.29W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 275.55W | 278.0W | 3.63W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.579% | 1.401% | 46.34°C | 11.143% | -| flaggems监控结果 | 0.664% | 1.393% | 46.84°C | 11.138% | diff --git a/operation/benchmarks/prod/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/prod/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/prod/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/prod/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/reciprocal/cambricon/MLU/case_config.yaml b/operation/benchmarks/reciprocal/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/reciprocal/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/reciprocal/cambricon/MLU/env.sh b/operation/benchmarks/reciprocal/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/reciprocal/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/reciprocal/case_config.yaml b/operation/benchmarks/reciprocal/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/reciprocal/case_config.yaml +++ b/operation/benchmarks/reciprocal/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/reciprocal/iluvatar/BI150/README.md b/operation/benchmarks/reciprocal/iluvatar/BI150/README.md new file mode 100644 index 000000000..bad311811 --- /dev/null +++ b/operation/benchmarks/reciprocal/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 2.15E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.29% | 0.29% | +| nativetorch | 2.15E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | 
cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 6.04E-10 | 7408.89us | 7416.92us | 134.97op/s | 134.83op/s | 240487.77us | 7943.96us | +| nativetorch | 6.04E-10 | 7381.8us | 7387.88us | 135.47op/s | 135.36op/s | 7882.89us | 7665.21us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2090.0W | 2109.0W | 32.91W | / | 170.68W | 171.0W | 2.67W | 350W | +| flaggems监控结果 | 2099.5W | 2128.0W | 49.36W | / | 176.97W | 177.0W | 0.16W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 39.881% | 2.584% | 49.15°C | 25.739% | +| flaggems监控结果 | 40.9% | 2.579% | 49.79°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/reciprocal/iluvatar/BI150/case_config.yaml b/operation/benchmarks/reciprocal/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/reciprocal/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/reciprocal/iluvatar/BI150/env.sh b/operation/benchmarks/reciprocal/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/reciprocal/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git 
a/operation/benchmarks/reciprocal/iluvatar/BI150/requirements.txt b/operation/benchmarks/reciprocal/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/reciprocal/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/reciprocal/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/reciprocal/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/reciprocal/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/reciprocal/kunlunxin/R300p/env.sh b/operation/benchmarks/reciprocal/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/reciprocal/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/reciprocal/main.py b/operation/benchmarks/reciprocal/main.py index 4dce22547..edc206f40 100644 --- a/operation/benchmarks/reciprocal/main.py +++ b/operation/benchmarks/reciprocal/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = 
correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.reciprocal(a_fp64) - - a = a.to(0) - r_device = torch.reciprocal(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "reciprocal", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/reciprocal/metax/C550_64/case_config.yaml b/operation/benchmarks/reciprocal/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/reciprocal/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/reciprocal/metax/C550_64/env.sh b/operation/benchmarks/reciprocal/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- 
/dev/null +++ b/operation/benchmarks/reciprocal/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..48b3eb427 --- /dev/null +++ b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3228.67us | 3239.94us | 309.72op/s | 308.65op/s | 904897.5us | 3317.92us | +| nativetorch | 3122.49us | 3128.32us | 320.26op/s | 319.66op/s | 1006889.77us | 3141.61us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 322.94W | 326.0W | 5.24W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 331.67W | 336.0W | 6.13W | 400W | + +## 其他重要监控结果 + +| 监控项 | 
系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.339% | 2.508% | 52.42°C | 17.394% | +| flaggems监控结果 | 0.787% | 2.506% | 53.31°C | 17.207% | diff --git a/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..6e94e41d7 --- /dev/null +++ b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | +| nativetorch | True | 0.35TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3222.97us | 3230.72us | 310.27op/s | 309.53op/s | 925282.23us | 3303.58us | +| nativetorch | 3107.07us | 3112.96us | 321.85op/s | 321.24op/s | 19897.62us | 3128.01us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1716.0W | 156.0W | / | 328.03W | 333.0W | 5.41W | 400W | +| flaggems监控结果 | 1599.0W | 1716.0W | 117.0W | / | 337.18W | 343.0W | 
7.07W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.853% | 2.517% | 52.38°C | 17.394% | +| flaggems监控结果 | 0.877% | 2.51% | 52.86°C | 17.207% | diff --git a/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..e46f1bbcc --- /dev/null +++ b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | True | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6264.91us | 6277.12us | 159.62op/s | 159.31op/s | 914679.05us | 6360.73us | +| nativetorch | 6191.12us | 6194.18us | 161.52op/s | 161.44op/s | 36679.35us | 6227.96us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1716.0W | 110.31W | / | 281.63W | 286.0W | 3.29W | 400W | +| flaggems监控结果 | 1638.0W | 
1716.0W | 110.31W | / | 294.05W | 299.0W | 4.67W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.796% | 2.494% | 49.91°C | 42.656% | +| flaggems监控结果 | 0.949% | 2.488% | 50.68°C | 32.364% | diff --git a/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/README.md b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/README.md deleted file mode 100644 index a8755fb88..000000000 --- a/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 2.37E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 2.15E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 5.15E-10 | 6188.41us | 6190.08us | 161.59op/s | 161.55op/s | 258560.12us | 6246.11us | -| nativetorch | 4.38E-10 | 6191.76us | 6195.2us | 161.5op/s | 161.42op/s | 10662.18us | 6239.27us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| 
nativetorch监控结果 | 1677.0W | 1716.0W | 39.0W | / | 279.84W | 286.0W | 4.71W | 1677.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 307.92W | 312.0W | 3.25W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.703% | 1.3% | 50.21°C | 31.535% | -| flaggems监控结果 | 1.437% | 1.298% | 51.57°C | 31.347% | diff --git a/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/reciprocal/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/relu/cambricon/MLU/case_config.yaml b/operation/benchmarks/relu/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/relu/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/relu/cambricon/MLU/env.sh b/operation/benchmarks/relu/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/relu/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/relu/case_config.yaml b/operation/benchmarks/relu/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/relu/case_config.yaml +++ b/operation/benchmarks/relu/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/relu/iluvatar/BI150/README.md b/operation/benchmarks/relu/iluvatar/BI150/README.md new file mode 100644 index 000000000..c367205f1 --- /dev/null +++ b/operation/benchmarks/relu/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* 
TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 0.00E+00 | 0.65TFLOPS | 0.65TFLOPS | 2.65% | 2.65% | +| nativetorch | 0.00E+00 | 0.65TFLOPS | 0.65TFLOPS | 2.66% | 2.66% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 0.00E+00 | 7408.1us | 7413.82us | 134.99op/s | 134.88op/s | 236651.48us | 7906.84us | +| nativetorch | 0.00E+00 | 7380.5us | 7390.14us | 135.49op/s | 135.32op/s | 7733.95us | 7651.34us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2052.0W | 2071.0W | 32.91W | / | 162.85W | 164.0W | 2.86W | 350W | +| flaggems监控结果 | 2066.25W | 2090.0W | 31.15W | / | 167.89W | 168.0W | 0.31W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 41.146% | 2.388% | 45.79°C | 19.489% | +| flaggems监控结果 | 40.651% | 2.391% | 46.23°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/relu/iluvatar/BI150/case_config.yaml b/operation/benchmarks/relu/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/relu/iluvatar/BI150/case_config.yaml 
@@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/relu/iluvatar/BI150/env.sh b/operation/benchmarks/relu/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/relu/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/relu/iluvatar/BI150/requirements.txt b/operation/benchmarks/relu/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/relu/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/relu/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/relu/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..bfd12215d --- /dev/null +++ b/operation/benchmarks/relu/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Melements: 1 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/relu/kunlunxin/R300p/env.sh b/operation/benchmarks/relu/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/relu/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/relu/main.py b/operation/benchmarks/relu/main.py index 1a08e5079..18dd3f94d 100644 --- a/operation/benchmarks/relu/main.py +++ b/operation/benchmarks/relu/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import 
time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,42 +53,31 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation") m = case_config.Melements f = torch.nn.ReLU() - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = f(a_fp64) - - a = a.to(0) - r_device = f(a).cpu() - mape = torch.mean(torch.where(r_fp64 == 0, torch.tensor(0.0), torch.abs(r_device - r_fp64) / torch.abs(r_fp64))) - #mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat], requires_grad=True).to(0) latency_nowarm, latency_warm, cputime, kerneltime = do_test( - f, (a, ), host_device_sync, config, case_config) + f, (a, ), host_device_sync, config, case_config, bp=True) - op2flops = lambda x: x * 9 * m * 1024 * 1024 + op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - 
print_result(config, "relu", *perf_result, mape, mape_std, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -88,6 +85,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/relu/metax/C550_64/case_config.yaml b/operation/benchmarks/relu/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/relu/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/relu/metax/C550_64/env.sh b/operation/benchmarks/relu/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/relu/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/relu/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/relu/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..d1bbb8ad6 --- /dev/null +++ b/operation/benchmarks/relu/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.41TFLOPS | 0.41TFLOPS | 0.13% | 0.13% | +| nativetorch | True | 0.36TFLOPS | 0.36TFLOPS | 0.12% | 0.12% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 7949.25us | 7947.26us | 125.8op/s | 125.83op/s | 1747451.78us | 3349.93us | +| nativetorch | 8891.32us | 8901.63us | 112.47op/s | 112.34op/s | 15680.99us | 3162.26us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1716.0W | 120.84W | / | 280.1W | 283.0W | 3.64W | 400W | +| flaggems监控结果 | 1657.5W | 1716.0W | 101.32W | / | 317.92W | 322.0W | 4.01W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.979% | 1.457% | 52.13°C | 21.43% | +| flaggems监控结果 | 1.529% | 1.383% | 53.2°C | 21.243% | diff --git a/operation/benchmarks/relu/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/relu/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..aa12dfa25 --- /dev/null +++ b/operation/benchmarks/relu/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.41TFLOPS | 0.41TFLOPS | 0.13% | 0.13% | +| nativetorch | True | 0.36TFLOPS | 0.36TFLOPS | 0.12% | 0.12% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 7930.28us | 7921.66us | 126.1op/s | 126.24op/s | 816985.95us | 3327.31us | +| nativetorch | 8929.59us | 8942.59us | 111.99op/s | 111.82op/s | 15540.94us | 3126.66us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1716.0W | 120.84W | / | 283.48W | 287.0W | 4.09W | 400W | +| flaggems监控结果 | 1657.5W | 1716.0W | 101.32W | / | 314.62W | 318.0W | 4.19W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 2.332% | 1.402% | 52.33°C | 21.43% | +| flaggems监控结果 | 0.635% | 1.386% | 52.63°C | 21.43% | diff --git a/operation/benchmarks/relu/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/relu/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..8c529aebc --- /dev/null +++ b/operation/benchmarks/relu/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.21TFLOPS | 0.21TFLOPS | 1.07% | 1.07% | +| nativetorch | True | 0.21TFLOPS | 0.21TFLOPS | 1.07% | 1.07% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 15421.62us | 15410.18us | 64.84op/s | 64.89op/s | 822760.42us | 6496.41us | +| nativetorch | 15435.04us | 15446.02us | 64.79op/s | 64.74op/s | 20672.24us | 6220.33us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1604.57W | 1638.0W | 81.88W | / | 262.4W | 265.0W | 2.68W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 102.13W | / | 278.97W | 283.0W | 2.75W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.868% | 1.649% | 50.44°C | 41.64% | +| flaggems监控结果 | 1.195% | 1.609% | 52.04°C | 41.452% | diff --git a/operation/benchmarks/relu/nvidia/A100_40_SXM/README.md b/operation/benchmarks/relu/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 2e086e583..000000000 --- a/operation/benchmarks/relu/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 
-* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:9168f2d031ecc1b31a9f658fb66dd6735b7306b3 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 0.00E+00 | 1.56TFLOPS | 1.56TFLOPS | 8.0% | 8.0% | -| nativetorch | 0.00E+00 | 1.56TFLOPS | 1.56TFLOPS | 8.0% | 8.0% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 0.00E+00 | 6191.06us | 6194.18us | 161.52op/s | 161.44op/s | 410813.62us | 6272.68us | -| nativetorch | 0.00E+00 | 6194.68us | 6198.27us | 161.43op/s | 161.34op/s | 10396.25us | 6263.84us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 254.82W | 259.0W | 4.5W | 1638.0 | -| flaggems监控结果 | 1677.0W | 1716.0W | 39.0W | / | 289.42W | 295.0W | 3.19W | 1677.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.71% | 1.396% | 48.55°C | 41.64% | -| flaggems监控结果 | 0.726% | 1.396% | 51.23°C | 31.347% | diff --git a/operation/benchmarks/relu/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/relu/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/relu/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/relu/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
a/operation/benchmarks/rsqrt/cambricon/MLU/case_config.yaml b/operation/benchmarks/rsqrt/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/rsqrt/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/rsqrt/cambricon/MLU/env.sh b/operation/benchmarks/rsqrt/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/rsqrt/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/rsqrt/case_config.yaml b/operation/benchmarks/rsqrt/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/rsqrt/case_config.yaml +++ b/operation/benchmarks/rsqrt/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/rsqrt/iluvatar/BI150/README.md b/operation/benchmarks/rsqrt/iluvatar/BI150/README.md new file mode 100644 index 000000000..96829eaab --- /dev/null +++ b/operation/benchmarks/rsqrt/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 3.20E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.29% | +| nativetorch | 2.13E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | 
-------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 9.48E-10 | 7399.51us | 7417.8us | 135.14op/s | 134.81op/s | 170667.23us | 7833.41us | +| nativetorch | 5.43E-10 | 7358.94us | 7380.47us | 135.89op/s | 135.49op/s | 7495.17us | 7578.81us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2066.25W | 2090.0W | 41.14W | / | 179.99W | 180.0W | 0.12W | 350W | +| flaggems监控结果 | 2066.25W | 2090.0W | 41.14W | / | 176.03W | 177.0W | 0.23W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 41.505% | 2.388% | 48.97°C | 19.489% | +| flaggems监控结果 | 40.023% | 2.391% | 47.55°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/rsqrt/iluvatar/BI150/case_config.yaml b/operation/benchmarks/rsqrt/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/rsqrt/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/rsqrt/iluvatar/BI150/env.sh b/operation/benchmarks/rsqrt/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/rsqrt/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/rsqrt/iluvatar/BI150/requirements.txt 
b/operation/benchmarks/rsqrt/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/rsqrt/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/rsqrt/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/rsqrt/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/rsqrt/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/rsqrt/kunlunxin/R300p/env.sh b/operation/benchmarks/rsqrt/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/rsqrt/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/rsqrt/main.py b/operation/benchmarks/rsqrt/main.py index 63c355a80..ac6a84f09 100644 --- a/operation/benchmarks/rsqrt/main.py +++ b/operation/benchmarks/rsqrt/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,33 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, 
+ "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - a = torch.abs(a) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.rsqrt(a_fp64) - - a = a.to(0) - r_device = torch.rsqrt(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) a = torch.abs(a) @@ -79,11 +75,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.rsqrt, (a, ), host_device_sync, config, case_config) - op2flops = lambda x: x * m * 1024 * 1024 + op2flops = lambda x: x * 2 * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "rsqrt", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -91,6 +87,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/rsqrt/metax/C550_64/case_config.yaml b/operation/benchmarks/rsqrt/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/rsqrt/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/rsqrt/metax/C550_64/env.sh b/operation/benchmarks/rsqrt/metax/C550_64/env.sh new file mode 
100644 index 000000000..fc849ff1e --- /dev/null +++ b/operation/benchmarks/rsqrt/metax/C550_64/env.sh @@ -0,0 +1,4 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 +export TRITON_ENABLE_FAST_DIVF=1 +export TRITON_ENABLE_MXC_SQRT_F32=1 \ No newline at end of file diff --git a/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..b317eb466 --- /dev/null +++ b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.66TFLOPS | 0.66TFLOPS | 0.21% | 0.21% | +| nativetorch | True | 0.7TFLOPS | 0.69TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3229.95us | 3241.98us | 309.6op/s | 308.45op/s | 525605.34us | 3312.69us | +| nativetorch | 3083.38us | 3091.46us | 324.32op/s | 323.47op/s | 17344.79us | 3101.13us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 265.48W | 271.0W | 4.17W | 400W | +| flaggems监控结果 | 1638.0W | 
1794.0W | 156.0W | / | 331.78W | 336.0W | 5.64W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.798% | 2.31% | 47.82°C | 17.394% | +| flaggems监控结果 | 0.786% | 2.309% | 52.92°C | 17.212% | diff --git a/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..1f17a1c26 --- /dev/null +++ b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.67TFLOPS | 0.66TFLOPS | 0.21% | 0.21% | +| nativetorch | True | 0.7TFLOPS | 0.69TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3226.61us | 3237.89us | 309.92op/s | 308.84op/s | 182819.27us | 3321.58us | +| nativetorch | 3083.05us | 3091.46us | 324.35op/s | 323.47op/s | 17137.52us | 3101.54us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 279.13W | 286.0W | 4.96W | 400W | +| 
flaggems监控结果 | 1638.0W | 1794.0W | 156.0W | / | 344.67W | 352.0W | 7.13W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.78% | 2.316% | 49.12°C | 17.394% | +| flaggems监控结果 | 0.902% | 2.311% | 54.38°C | 22.447% | diff --git a/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..0902668e6 --- /dev/null +++ b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.34TFLOPS | 0.34TFLOPS | 1.76% | 1.75% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 1.78% | 1.78% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6259.88us | 6277.12us | 159.75op/s | 159.31op/s | 1517689.63us | 6451.51us | +| nativetorch | 6195.69us | 6198.27us | 161.4op/s | 161.34op/s | 20975.14us | 6230.02us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1586.0W | 1638.0W | 73.54W | / | 258.79W | 
263.0W | 2.82W | 400W | +| flaggems监控结果 | 1638.0W | 1716.0W | 110.31W | / | 295.22W | 299.0W | 3.26W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.733% | 2.288% | 48.56°C | 42.656% | +| flaggems监控结果 | 1.059% | 2.291% | 50.04°C | 32.369% | diff --git a/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/README.md b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 54ca1c5aa..000000000 --- a/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:9168f2d031ecc1b31a9f658fb66dd6735b7306b3 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 3.50E-08 | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | -| nativetorch | 2.74E-08 | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 8.18E-10 | 6184.4us | 6190.08us | 161.7op/s | 161.55op/s | 185962.49us | 6251.48us | -| nativetorch | 5.91E-10 | 6195.47us | 6197.25us | 161.41op/s | 161.36op/s | 6267.83us | 6237.22us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | 
------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 258.61W | 263.0W | 4.54W | 1638.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 308.11W | 312.0W | 3.87W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.76% | 1.39% | 48.52°C | 31.535% | -| flaggems监控结果 | 1.054% | 1.392% | 51.82°C | 41.64% | diff --git a/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/rsqrt/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git a/operation/benchmarks/rsub/cambricon/MLU/case_config.yaml b/operation/benchmarks/rsub/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/rsub/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/rsub/cambricon/MLU/env.sh b/operation/benchmarks/rsub/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/rsub/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/rsub/case_config.yaml b/operation/benchmarks/rsub/case_config.yaml new file mode 100644 index 000000000..70fc45775 --- /dev/null +++ b/operation/benchmarks/rsub/case_config.yaml @@ -0,0 +1,5 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 10000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/rsub/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/rsub/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/rsub/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 
50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/rsub/kunlunxin/R300p/env.sh b/operation/benchmarks/rsub/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/rsub/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/rsub/main.py b/operation/benchmarks/rsub/main.py new file mode 100644 index 000000000..2e6649314 --- /dev/null +++ b/operation/benchmarks/rsub/main.py @@ -0,0 +1,101 @@ + # Copyright (c) 2024 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + 
"INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + m = case_config.Melements + + a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + torch.rsub, (a, b), host_device_sync, config, case_config) + + op2flops = lambda x: x * 2 * m * 1024 * 1024 + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/rsub/metax/C550_64/case_config.yaml b/operation/benchmarks/rsub/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..615415ef9 --- /dev/null +++ b/operation/benchmarks/rsub/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 10000 \ No newline at end of file diff --git a/operation/benchmarks/rsub/metax/C550_64/env.sh b/operation/benchmarks/rsub/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/rsub/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/rsub/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/rsub/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..c158e6c79 --- /dev/null +++ 
b/operation/benchmarks/rsub/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.46TFLOPS | 0.46TFLOPS | 0.15% | 0.15% | +| nativetorch | True | 0.45TFLOPS | 0.45TFLOPS | 0.15% | 0.15% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4671.66us | 4678.66us | 214.06op/s | 213.74op/s | 831887.47us | 4774.65us | +| nativetorch | 4742.81us | 4746.24us | 210.85op/s | 210.69op/s | 16621.52us | 4744.37us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 249.1W | 253.0W | 3.96W | 400W | +| flaggems监控结果 | 1560.0W | 1716.0W | 156.0W | / | 294.0W | 301.0W | 5.39W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.638% | 1.125% | 41.88°C | 21.425% | +| flaggems监控结果 | 0.663% | 1.149% | 49.1°C | 21.238% | diff --git a/operation/benchmarks/rsub/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/rsub/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..1a82b610f 
--- /dev/null +++ b/operation/benchmarks/rsub/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.46TFLOPS | 0.46TFLOPS | 0.15% | 0.15% | +| nativetorch | True | 0.45TFLOPS | 0.45TFLOPS | 0.15% | 0.15% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4666.6us | 4673.54us | 214.29op/s | 213.97op/s | 825061.25us | 5049.62us | +| nativetorch | 4745.23us | 4746.24us | 210.74op/s | 210.69op/s | 16639.46us | 4744.56us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 262.7W | 267.0W | 3.58W | 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 292.3W | 300.0W | 4.41W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.556% | 1.153% | 43.19°C | 21.425% | +| flaggems监控结果 | 0.907% | 1.164% | 45.1°C | 26.478% | diff --git a/operation/benchmarks/rsub/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/rsub/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 
000000000..867bbb206 --- /dev/null +++ b/operation/benchmarks/rsub/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.23TFLOPS | 0.23TFLOPS | 1.18% | 1.18% | +| nativetorch | True | 0.23TFLOPS | 0.23TFLOPS | 1.16% | 1.16% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9331.11us | 9315.33us | 107.17op/s | 107.35op/s | 1966115.37us | 9438.81us | +| nativetorch | 9472.98us | 9479.17us | 105.56op/s | 105.49op/s | 22972.82us | 9509.4us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 251.63W | 257.0W | 3.56W | 400W | +| flaggems监控结果 | 1521.0W | 1638.0W | 117.0W | / | 271.47W | 278.0W | 4.86W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.788% | 1.122% | 45.26°C | 41.635% | +| flaggems监控结果 | 0.919% | 1.106% | 46.89°C | 41.447% | diff --git a/operation/benchmarks/rsub/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/rsub/nvidia/A100_40_SXM/case_config.yaml new 
file mode 100644 index 000000000..fde4f8949 --- /dev/null +++ b/operation/benchmarks/rsub/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 10000 diff --git a/operation/benchmarks/rsub/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/rsub/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ b/operation/benchmarks/rsub/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/rsub/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/rsub/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/rsub/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/sigmoid/cambricon/MLU/case_config.yaml b/operation/benchmarks/sigmoid/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/sigmoid/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/sigmoid/cambricon/MLU/env.sh b/operation/benchmarks/sigmoid/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/sigmoid/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/sigmoid/case_config.yaml b/operation/benchmarks/sigmoid/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/sigmoid/case_config.yaml +++ b/operation/benchmarks/sigmoid/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/sigmoid/iluvatar/BI150/README.md b/operation/benchmarks/sigmoid/iluvatar/BI150/README.md new file mode 100644 index 000000000..5e426a762 --- /dev/null +++ b/operation/benchmarks/sigmoid/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 
服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 3.86E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.29% | 0.29% | +| nativetorch | 3.93E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 1.27E-09 | 7408.37us | 7419.17us | 134.98op/s | 134.79op/s | 358375.78us | 7980.38us | +| nativetorch | 1.32E-09 | 7355.66us | 7391.53us | 135.95op/s | 135.29op/s | 7657.92us | 7585.98us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2066.25W | 2090.0W | 41.14W | / | 179.86W | 180.0W | 0.35W | 350W | +| flaggems监控结果 | 2066.25W | 2090.0W | 41.14W | / | 178.0W | 178.0W | 0.0W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 43.185% | 2.393% | 49.66°C | 19.489% | +| flaggems监控结果 | 40.879% | 2.391% | 48.55°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/sigmoid/iluvatar/BI150/case_config.yaml b/operation/benchmarks/sigmoid/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/sigmoid/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ 
+Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/sigmoid/iluvatar/BI150/env.sh b/operation/benchmarks/sigmoid/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/sigmoid/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/sigmoid/iluvatar/BI150/requirements.txt b/operation/benchmarks/sigmoid/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/sigmoid/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/sigmoid/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/sigmoid/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..bfd12215d --- /dev/null +++ b/operation/benchmarks/sigmoid/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Melements: 1 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/sigmoid/kunlunxin/R300p/env.sh b/operation/benchmarks/sigmoid/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/sigmoid/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/sigmoid/main.py b/operation/benchmarks/sigmoid/main.py index 7b526ac78..8ad13924e 100644 --- a/operation/benchmarks/sigmoid/main.py +++ b/operation/benchmarks/sigmoid/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import 
torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,43 +53,32 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.sigmoid(a_fp64) - - a = a.to(0) - r_device = torch.sigmoid(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat], requires_grad=True).to(0) latency_nowarm, latency_warm, cputime, kerneltime = do_test( - torch.sigmoid, (a, ), host_device_sync, config, case_config) + torch.sigmoid, (a, ), host_device_sync, config, case_config, bp=True) - op2flops = lambda x: x * m * 1024 * 1024 + op2flops = lambda x: x * 3 * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - 
print_result(config, "sigmoid", *perf_result, mape, mape_std, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -101,4 +99,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + main(config, case_config) diff --git a/operation/benchmarks/sigmoid/metax/C550_64/case_config.yaml b/operation/benchmarks/sigmoid/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/sigmoid/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/sigmoid/metax/C550_64/env.sh b/operation/benchmarks/sigmoid/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/sigmoid/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..cd60de3bc --- /dev/null +++ b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.32TFLOPS | 0.32TFLOPS | 0.1% | 0.1% | +| nativetorch | True | 1.06TFLOPS | 1.05TFLOPS | 0.34% | 0.34% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 30569.56us | 30577.66us | 32.71op/s | 32.7op/s | 2089529.7us | 17552.95us | +| nativetorch | 9152.9us | 9165.82us | 109.25op/s | 109.1op/s | 12863.21us | 3178.18us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1653.6W | 1716.0W | 124.8W | / | 308.49W | 315.0W | 4.54W | 400W | +| flaggems监控结果 | 1671.43W | 1716.0W | 81.88W | / | 283.47W | 293.0W | 5.67W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.921% | 1.461% | 52.55°C | 21.43% | +| flaggems监控结果 | 1.16% | 1.481% | 52.63°C | 51.744% | diff --git a/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..8f8341d23 --- /dev/null +++ b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.32TFLOPS | 0.32TFLOPS | 0.1% | 0.1% | +| nativetorch | True | 1.06TFLOPS | 1.06TFLOPS | 0.34% | 0.34% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 30331.64us | 30346.24us | 32.97op/s | 32.95op/s | 888133.22us | 17287.71us | +| nativetorch | 9118.67us | 9132.03us | 109.67op/s | 109.5op/s | 11798.23us | 3255.78us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1669.2W | 1716.0W | 93.6W | / | 320.12W | 328.0W | 5.74W | 400W | +| flaggems监控结果 | 1682.57W | 1716.0W | 81.88W | / | 287.07W | 297.0W | 5.95W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.939% | 1.456% | 53.37°C | 21.43% | +| flaggems监控结果 | 1.472% | 1.48% | 51.47°C | 51.744% | diff --git a/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..dac21df2d --- /dev/null +++ b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.63TFLOPS | 0.63TFLOPS | 3.22% | 3.21% | +| nativetorch | True | 0.63TFLOPS | 0.63TFLOPS | 3.21% | 3.21% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 15409.24us | 15415.3us | 64.9op/s | 64.87op/s | 1009000.95us | 6679.12us | +| nativetorch | 15429.96us | 15439.87us | 64.81op/s | 64.77op/s | 17516.76us | 6287.45us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1671.43W | 1716.0W | 81.88W | / | 286.63W | 293.0W | 4.1W | 400W | +| flaggems监控结果 | 1660.29W | 1716.0W | 108.03W | / | 290.35W | 297.0W | 3.42W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.651% | 1.661% | 52.35°C | 41.64% | +| flaggems监控结果 | 1.179% | 1.665% | 52.31°C | 41.457% | diff --git a/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/README.md b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 182f538f9..000000000 --- a/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* 
docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 3.83E-08 | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | -| nativetorch | 3.49E-08 | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 8.56E-10 | 6184.88us | 6188.03us | 161.68op/s | 161.6op/s | 302165.88us | 6273.36us | -| nativetorch | 7.91E-10 | 6183.92us | 6188.03us | 161.71op/s | 161.6op/s | 701797.94us | 6237.07us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1716.0W | 1716.0W | 0.0W | / | 300.44W | 305.0W | 3.32W | 1716.0 | -| flaggems监控结果 | 1716.0W | 1716.0W | 0.0W | / | 313.47W | 318.0W | 3.29W | 1716.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.821% | 1.399% | 51.01°C | 31.535% | -| flaggems监控结果 | 0.642% | 1.397% | 52.08°C | 31.352% | diff --git a/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/sigmoid/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff 
--git a/operation/benchmarks/silu/cambricon/MLU/case_config.yaml b/operation/benchmarks/silu/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/silu/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/silu/cambricon/MLU/env.sh b/operation/benchmarks/silu/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/silu/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/silu/case_config.yaml b/operation/benchmarks/silu/case_config.yaml new file mode 100644 index 000000000..acc0f44fb --- /dev/null +++ b/operation/benchmarks/silu/case_config.yaml @@ -0,0 +1,5 @@ +Melements: 1024 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/silu/iluvatar/BI150/README.md b/operation/benchmarks/silu/iluvatar/BI150/README.md new file mode 100644 index 000000000..ace317430 --- /dev/null +++ b/operation/benchmarks/silu/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 3.83E-08 | 0.65TFLOPS | 0.65TFLOPS | 2.66% | 2.65% | +| nativetorch | 3.90E-08 | 0.66TFLOPS | 0.65TFLOPS | 2.67% | 2.66% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | 
-------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 1.41E-09 | 7402.82us | 7423.31us | 135.08op/s | 134.71op/s | 345572.8us | 8428.09us | +| nativetorch | 1.39E-09 | 7356.54us | 7394.25us | 135.93op/s | 135.24op/s | 7795.48us | 7771.99us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2066.25W | 2090.0W | 41.14W | / | 181.39W | 182.0W | 4.39W | 350W | +| flaggems监控结果 | 2066.25W | 2090.0W | 41.14W | / | 177.99W | 178.0W | 0.12W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 44.625% | 2.388% | 49.5°C | 19.489% | +| flaggems监控结果 | 42.222% | 2.391% | 48.0°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/silu/iluvatar/BI150/case_config.yaml b/operation/benchmarks/silu/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/silu/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/silu/iluvatar/BI150/env.sh b/operation/benchmarks/silu/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/silu/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/silu/iluvatar/BI150/requirements.txt 
b/operation/benchmarks/silu/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/silu/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/silu/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/silu/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..40654d3db --- /dev/null +++ b/operation/benchmarks/silu/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Melements: 16 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/silu/kunlunxin/R300p/env.sh b/operation/benchmarks/silu/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/silu/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/silu/main.py b/operation/benchmarks/silu/main.py new file mode 100644 index 000000000..a7e354ee0 --- /dev/null +++ b/operation/benchmarks/silu/main.py @@ -0,0 +1,111 @@ + # Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess +import math + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + f = torch.nn.SiLU() + + m = case_config.Melements + # default shape: (M, 1024, 1024) + shape = (m, 1024, 1024) + + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape + + a = torch.randn(shape, dtype=dtype[config.dataformat], requires_grad=True).to(0) + print(f'Shape for performance_test: {a.shape}') + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, ), host_device_sync, config, case_config, bp=True) + + 
op2flops = lambda x: x * 4 * math.prod(shape) + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/silu/metax/C550_64/case_config.yaml b/operation/benchmarks/silu/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/silu/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/silu/metax/C550_64/env.sh b/operation/benchmarks/silu/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/silu/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/silu/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/silu/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..3bb3eed48 --- /dev/null +++ b/operation/benchmarks/silu/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 1.4TFLOPS | 1.4TFLOPS | 0.45% | 0.45% | +| nativetorch | True | 1.34TFLOPS | 1.34TFLOPS | 0.43% | 0.43% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9171.83us | 9178.11us | 109.03op/s | 108.95op/s | 1641604.38us | 3378.84us | +| nativetorch | 9637.83us | 9630.72us | 103.76op/s | 103.83op/s | 11643.45us | 3240.97us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1716.0W | 1794.0W | 120.84W | / | 341.29W | 349.0W | 6.87W | 400W | +| flaggems监控结果 | 1684.8W | 1794.0W | 144.67W | / | 343.85W | 348.0W | 5.2W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.412% | 1.458% | 53.36°C | 21.43% | +| flaggems监控结果 | 0.839% | 1.457% | 53.74°C | 21.248% | diff --git a/operation/benchmarks/silu/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/silu/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..50b8525a7 --- /dev/null +++ b/operation/benchmarks/silu/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 1.42TFLOPS | 1.42TFLOPS | 0.46% | 0.46% | +| nativetorch | True | 1.35TFLOPS | 1.35TFLOPS | 0.43% | 0.43% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9061.96us | 9074.69us | 110.35op/s | 110.2op/s | 822519.32us | 3404.19us | +| nativetorch | 9575.11us | 9577.47us | 104.44op/s | 104.41op/s | 1104649.34us | 3208.99us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1731.6W | 1794.0W | 124.8W | / | 347.28W | 353.0W | 4.23W | 400W | +| flaggems监控结果 | 1731.6W | 1794.0W | 124.8W | / | 353.12W | 359.0W | 5.98W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.619% | 1.447% | 53.57°C | 21.43% | +| flaggems监控结果 | 1.13% | 1.457% | 54.62°C | 16.195% | diff --git a/operation/benchmarks/silu/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/silu/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..974f11daf --- /dev/null +++ b/operation/benchmarks/silu/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* 
docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.84TFLOPS | 0.83TFLOPS | 4.29% | 4.28% | +| nativetorch | True | 0.83TFLOPS | 0.83TFLOPS | 4.24% | 4.23% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 15394.14us | 15431.68us | 64.96op/s | 64.8op/s | 890548.61us | 6541.68us | +| nativetorch | 15596.53us | 15608.83us | 64.12op/s | 64.07op/s | 16422.59us | 6213.16us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1671.43W | 1716.0W | 109.18W | / | 312.8W | 317.0W | 4.66W | 400W | +| flaggems监控结果 | 1671.43W | 1716.0W | 109.18W | / | 312.79W | 317.0W | 3.31W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.144% | 1.642% | 52.65°C | 41.64% | +| flaggems监控结果 | 1.926% | 1.653% | 52.27°C | 31.352% | diff --git a/operation/benchmarks/silu/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/silu/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/silu/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/silu/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/silu/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null 
+++ b/operation/benchmarks/silu/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/silu/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/silu/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/silu/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/benchmarks/sin/cambricon/MLU/case_config.yaml b/operation/benchmarks/sin/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/sin/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/sin/cambricon/MLU/env.sh b/operation/benchmarks/sin/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/sin/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/sin/case_config.yaml b/operation/benchmarks/sin/case_config.yaml index 6ebcc4612..acc0f44fb 100644 --- a/operation/benchmarks/sin/case_config.yaml +++ b/operation/benchmarks/sin/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/sin/iluvatar/BI150/README.md b/operation/benchmarks/sin/iluvatar/BI150/README.md new file mode 100644 index 000000000..9f4d25582 --- /dev/null +++ b/operation/benchmarks/sin/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS 
Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 2.06E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.29% | 0.29% | +| nativetorch | 2.06E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 6.51E-10 | 7411.07us | 7419.15us | 134.93op/s | 134.79op/s | 401710.11us | 7934.4us | +| nativetorch | 6.51E-10 | 7366.03us | 7370.8us | 135.76op/s | 135.67op/s | 7716.18us | 7658.1us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2080.5W | 2109.0W | 49.36W | / | 186.38W | 187.0W | 4.65W | 350W | +| flaggems监控结果 | 2080.5W | 2109.0W | 49.36W | / | 192.9W | 193.0W | 0.29W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 44.02% | 2.388% | 51.99°C | 30.432% | +| flaggems监控结果 | 40.995% | 2.391% | 53.44°C | 20.856% | \ No newline at end of file diff --git a/operation/benchmarks/sin/iluvatar/BI150/case_config.yaml b/operation/benchmarks/sin/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/sin/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/sin/iluvatar/BI150/env.sh b/operation/benchmarks/sin/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/sin/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export 
PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/sin/iluvatar/BI150/requirements.txt b/operation/benchmarks/sin/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/sin/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/sin/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/sin/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/sin/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/sin/kunlunxin/R300p/env.sh b/operation/benchmarks/sin/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..38a0db6a6 --- /dev/null +++ b/operation/benchmarks/sin/kunlunxin/R300p/env.sh @@ -0,0 +1,6 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +export XPU_enable_reorder=1 + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/sin/main.py b/operation/benchmarks/sin/main.py index 149319fae..0d129f37f 100644 --- a/operation/benchmarks/sin/main.py +++ b/operation/benchmarks/sin/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + 
required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,21 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.sin(a_fp64) - - a = a.to(0) - r_device = torch.sin(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -80,8 +77,8 @@ def main(config, case_config): op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "sin", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/sin/metax/C550_64/case_config.yaml b/operation/benchmarks/sin/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/sin/metax/C550_64/case_config.yaml @@ 
-0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/sin/metax/C550_64/env.sh b/operation/benchmarks/sin/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/sin/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/sin/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/sin/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..9f1ef89ff --- /dev/null +++ b/operation/benchmarks/sin/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | : False | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | +| nativetorch | : False | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3770.39us | 3776.51us | 265.22op/s | 264.79op/s | 1753860.08us | 3873.11us | +| nativetorch | 3249.78us | 3254.27us | 307.71op/s | 307.29op/s | 16870.64us | 3264.82us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1794.0W | 
156.0W | / | 359.33W | 363.0W | 5.43W | 400W | +| flaggems监控结果 | 1638.0W | 1794.0W | 156.0W | / | 386.53W | 392.0W | 7.37W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.744% | 2.308% | 54.8°C | 17.394% | +| flaggems监控结果 | 0.641% | 2.307% | 62.28°C | 17.212% | diff --git a/operation/benchmarks/sin/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/sin/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..e9414e5e5 --- /dev/null +++ b/operation/benchmarks/sin/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | : False | 0.28TFLOPS | 0.28TFLOPS | 0.09% | 0.09% | +| nativetorch | : False | 0.33TFLOPS | 0.33TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 3782.72us | 3781.63us | 264.36op/s | 264.44op/s | 2716894.22us | 3869.76us | +| nativetorch | 3249.35us | 3254.27us | 307.75op/s | 307.29op/s | 17517.91us | 3269.59us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 
nativetorch监控结果 | 1638.0W | 1794.0W | 156.0W | / | 377.52W | 384.0W | 8.45W | 400W | +| flaggems监控结果 | 1716.0W | 1872.0W | 168.5W | / | 396.84W | 408.0W | 8.66W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.671% | 2.311% | 55.98°C | 17.394% | +| flaggems监控结果 | 0.68% | 2.312% | 62.67°C | 17.212% | diff --git a/operation/benchmarks/sin/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/sin/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..d0be0af42 --- /dev/null +++ b/operation/benchmarks/sin/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | : False | 0.17TFLOPS | 0.17TFLOPS | 0.88% | 0.88% | +| nativetorch | : False | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 6254.92us | 6273.02us | 159.87op/s | 159.41op/s | 2450779.21us | 6365.54us | +| nativetorch | 6161.1us | 6163.46us | 162.31op/s | 162.25op/s | 21843.53us | 6187.37us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1638.0W | 1716.0W | 110.31W | / | 316.76W | 320.0W | 3.64W | 400W | +| flaggems监控结果 | 1690.0W | 1794.0W | 147.08W | / | 352.79W | 357.0W | 5.37W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.674% | 2.291% | 52.19°C | 32.551% | +| flaggems监控结果 | 1.21% | 2.291% | 53.59°C | 32.369% | diff --git a/operation/benchmarks/sin/nvidia/A100_40_SXM/README.md b/operation/benchmarks/sin/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 9e789742d..000000000 --- a/operation/benchmarks/sin/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - 
-https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 2.32E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | -| nativetorch | 2.32E-08 | 2.72TFLOPS | 2.72TFLOPS | 0.96% | 0.96% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 5.15E-10 | 6137.73us | 6141.95us | 162.93op/s | 162.81op/s | 351221.38us | 6210.37us | -| nativetorch | 5.15E-10 | 6161.26us | 6161.41us | 162.3op/s | 162.3op/s | 10667.05us | 6174.69us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1716.0W | 1716.0W | 0.0W | / | 313.77W | 319.0W | 4.79W | 1716.0 | -| flaggems监控结果 | 1794.0W | 1794.0W | 0.0W | / | 365.06W | 368.0W | 5.26W | 1794.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.757% | 1.298% | 52.17°C | 31.535% | -| flaggems监控结果 | 0.811% | 1.298% | 54.22°C | 41.64% | diff --git a/operation/benchmarks/sin/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/sin/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/sin/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/sin/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/softmax/cambricon/MLU/case_config.yaml 
b/operation/benchmarks/softmax/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/softmax/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/softmax/cambricon/MLU/env.sh b/operation/benchmarks/softmax/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/softmax/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/softmax/case_config.yaml b/operation/benchmarks/softmax/case_config.yaml new file mode 100644 index 000000000..2f117eee9 --- /dev/null +++ b/operation/benchmarks/softmax/case_config.yaml @@ -0,0 +1,5 @@ +Melements: 1024 +WARMUP: 10 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 diff --git a/operation/benchmarks/softmax/iluvatar/BI150/README.md b/operation/benchmarks/softmax/iluvatar/BI150/README.md new file mode 100644 index 000000000..5aa4e5684 --- /dev/null +++ b/operation/benchmarks/softmax/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 1.02E-07 | 0.14TFLOPS | 0.14TFLOPS | 0.59% | 0.59% | +| nativetorch | 7.96E-08 | 0.11TFLOPS | 0.11TFLOPS | 0.46% | 0.46% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | 
-------------- | -------------- | ------------ | +| flaggems | 1.36E-09 | 11114.98us | 11120.17us | 89.97op/s | 89.93op/s | 445973.49us | 12206.26us | +| nativetorch | 1.67E-09 | 14279.23us | 14298.75us | 70.03op/s | 69.94op/s | 14581.57us | 14520.96us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2087.29W | 2109.0W | 38.58W | / | 183.82W | 184.0W | 1.52W | 350W | +| flaggems监控结果 | 2071.0W | 2090.0W | 38.0W | / | 177.61W | 178.0W | 3.89W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 44.578% | 2.583% | 49.74°C | 25.739% | +| flaggems监控结果 | 43.335% | 2.578% | 47.83°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/softmax/iluvatar/BI150/case_config.yaml b/operation/benchmarks/softmax/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/softmax/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/softmax/iluvatar/BI150/env.sh b/operation/benchmarks/softmax/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/softmax/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/softmax/iluvatar/BI150/requirements.txt b/operation/benchmarks/softmax/iluvatar/BI150/requirements.txt new 
file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/softmax/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/softmax/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/softmax/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..398fbc7c8 --- /dev/null +++ b/operation/benchmarks/softmax/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 256] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/softmax/kunlunxin/R300p/env.sh b/operation/benchmarks/softmax/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/softmax/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/softmax/main.py b/operation/benchmarks/softmax/main.py new file mode 100644 index 000000000..9010120a1 --- /dev/null +++ b/operation/benchmarks/softmax/main.py @@ -0,0 +1,111 @@ +# Copyright (c) 2024 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +#!/usr/bin/env python3 +# -*- coding: UTF-8 -*- +import torch +import os +import time +from argparse import ArgumentParser, Namespace +import yaml +import sys +import subprocess +import math + +sys.path.append("..") +from drivers.utils import * +from drivers.calculate import * + + +def parse_args(): + parser = ArgumentParser(description=" ") + + parser.add_argument("--vendor", + type=str, + required=True, + help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") + + parser.add_argument("--dataformat", + type=str, + required=True, + help="like FP32,FP16") + + parser.add_argument("--oplib", + type=str, + required=True, + help="impl like pytorch/flaggems/cpp") + + parser.add_argument("--chip", + type=str, + required=True, + help="chip like A100_40_SXM") + + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } + set_ieee_float32(config.vendor) + + Melements = case_config.Melements + # default shape: (M, 1024, 1024) + shape = (Melements, 1024, 1024) + + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape + + a = torch.randn(shape, dtype=dtype[config.dataformat], requires_grad=True).to(0) + print(f'Shape for performance_test: {a.shape}') + + f = torch.nn.Softmax(dim=1).to(0) + + latency_nowarm, latency_warm, cputime, kerneltime = do_test( + f, (a, ), host_device_sync, config, 
case_config, bp=True) + + op2flops = lambda x: x * 3 * math.prod(shape) + + perf_result = cal_perf(cputime, kerneltime, op2flops, + config.spectflops, bp=True) + print_result(config, config.case_name, *perf_result, correctness, + latency_nowarm, latency_warm) + + +if __name__ == "__main__": + config = parse_args() + with open("case_config.yaml", "r") as file: + case_config = yaml.safe_load(file) + adapt_torch(config.vendor) + with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), + "r") as file: + case_config_vendor = yaml.safe_load(file) + case_config.update(case_config_vendor) + case_config = Namespace(**case_config) + + if config.oplib == "flaggems": + import flag_gems + flag_gems.enable() + print("Using flaggems") + else: + print("Using nativetorch") + main(config, case_config) diff --git a/operation/benchmarks/softmax/metax/C550_64/case_config.yaml b/operation/benchmarks/softmax/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..529af74ce --- /dev/null +++ b/operation/benchmarks/softmax/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 \ No newline at end of file diff --git a/operation/benchmarks/softmax/metax/C550_64/env.sh b/operation/benchmarks/softmax/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/softmax/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/softmax/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/softmax/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..f780690da --- /dev/null +++ b/operation/benchmarks/softmax/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.62TFLOPS | 0.62TFLOPS | 0.2% | 0.2% | +| nativetorch | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 15654.77us | 15666.18us | 63.88op/s | 63.83op/s | 724583.06us | 4865.3us | +| nativetorch | 28353.82us | 28334.08us | 35.27op/s | 35.29op/s | 1069128.0us | 12957.49us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1626.0W | 1638.0W | 41.57W | / | 249.24W | 258.0W | 7.14W | 400W | +| flaggems监控结果 | 1628.25W | 1716.0W | 60.89W | / | 275.94W | 289.0W | 12.86W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.503% | 1.473% | 50.21°C | 31.535% | +| flaggems监控结果 | 0.816% | 1.464% | 52.41°C | 27.09% | diff --git a/operation/benchmarks/softmax/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/softmax/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..8ac8a7213 --- /dev/null +++ b/operation/benchmarks/softmax/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.62TFLOPS | 0.62TFLOPS | 0.2% | 0.2% | +| nativetorch | True | 0.34TFLOPS | 0.34TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 15648.83us | 15656.96us | 63.9op/s | 63.87op/s | 715996.17us | 4876.97us | +| nativetorch | 28592.8us | 28576.77us | 34.97op/s | 34.99op/s | 24113.42us | 12988.55us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1626.0W | 1638.0W | 41.57W | / | 255.22W | 266.0W | 8.96W | 400W | +| flaggems监控结果 | 1628.25W | 1716.0W | 72.31W | / | 277.87W | 293.0W | 15.36W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.232% | 1.473% | 50.78°C | 26.483% | +| flaggems监控结果 | 1.288% | 1.464% | 52.58°C | 27.09% | diff --git a/operation/benchmarks/softmax/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/softmax/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..cf3c6171a --- /dev/null +++ b/operation/benchmarks/softmax/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* 
docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.32TFLOPS | 0.32TFLOPS | 1.63% | 1.63% | +| nativetorch | True | 0.25TFLOPS | 0.25TFLOPS | 1.27% | 1.27% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 30461.69us | 30479.36us | 32.83op/s | 32.81op/s | 1493095.44us | 9403.05us | +| nativetorch | 39113.73us | 39139.33us | 25.57op/s | 25.55op/s | 517653.19us | 16028.46us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1628.82W | 1638.0W | 36.71W | / | 257.2W | 275.0W | 12.28W | 400W | +| flaggems监控结果 | 1626.86W | 1638.0W | 40.18W | / | 254.58W | 271.0W | 10.57W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.037% | 1.669% | 50.57°C | 61.849% | +| flaggems监控结果 | 0.878% | 1.663% | 50.09°C | 52.351% | diff --git a/operation/benchmarks/softmax/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/softmax/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/softmax/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/softmax/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/softmax/nvidia/A100_40_SXM/env.sh new file mode 100644 index 
000000000..f1ac0f6f2 --- /dev/null +++ b/operation/benchmarks/softmax/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" \ No newline at end of file diff --git a/operation/benchmarks/softmax/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/softmax/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..7248303e5 --- /dev/null +++ b/operation/benchmarks/softmax/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git a/operation/benchmarks/sub/cambricon/MLU/case_config.yaml b/operation/benchmarks/sub/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/sub/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/sub/cambricon/MLU/env.sh b/operation/benchmarks/sub/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/sub/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/sub/case_config.yaml b/operation/benchmarks/sub/case_config.yaml index 6ebcc4612..70fc45775 100644 --- a/operation/benchmarks/sub/case_config.yaml +++ b/operation/benchmarks/sub/case_config.yaml @@ -1,6 +1,5 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 -ITERS: 50000 +ITERS: 10000 KERNELWARMUP: 10 -KERNELITERS: 1000 \ No newline at end of file +KERNELITERS: 1000 diff --git a/operation/benchmarks/sub/iluvatar/BI150/README.md b/operation/benchmarks/sub/iluvatar/BI150/README.md new file mode 100644 index 000000000..e1fa95834 --- /dev/null +++ b/operation/benchmarks/sub/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: 
contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 1.63E-08 | 0.1TFLOPS | 0.1TFLOPS | 0.39% | 0.39% | +| nativetorch | 1.63E-08 | 0.1TFLOPS | 0.1TFLOPS | 0.4% | 0.4% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 7.77E-10 | 11201.1us | 11214.15us | 89.28op/s | 89.17op/s | 257699.96us | 11638.84us | +| nativetorch | 7.77E-10 | 10871.35us | 10917.02us | 91.98op/s | 91.6op/s | 11248.25us | 11163.73us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2074.8W | 2090.0W | 30.4W | / | 163.72W | 164.0W | 2.89W | 350W | +| flaggems监控结果 | 2090.0W | 2109.0W | 38.0W | / | 168.97W | 169.0W | 0.16W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 39.076% | 2.579% | 47.91°C | 25.739% | +| flaggems监控结果 | 39.225% | 2.588% | 48.03°C | 25.739% | \ No newline at end of file diff --git a/operation/benchmarks/sub/iluvatar/BI150/case_config.yaml b/operation/benchmarks/sub/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/sub/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/sub/iluvatar/BI150/env.sh 
b/operation/benchmarks/sub/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/sub/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/sub/iluvatar/BI150/requirements.txt b/operation/benchmarks/sub/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/sub/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/sub/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/sub/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..ec1c9e233 --- /dev/null +++ b/operation/benchmarks/sub/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,2 @@ +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/sub/kunlunxin/R300p/env.sh b/operation/benchmarks/sub/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/sub/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/sub/main.py b/operation/benchmarks/sub/main.py index bcbce3a94..4e131abbe 100644 --- a/operation/benchmarks/sub/main.py +++ b/operation/benchmarks/sub/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like 
nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,32 +53,19 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation") m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - b = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - b_fp64 = b.to(torch.float64) - r_fp64 = torch.sub(a_fp64, b_fp64) # 修改为torch.sub - - a = a.to(0) - b = b.to(0) - r_device = torch.sub(a, b).cpu() # 同样修改为torch.sub - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) b = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) @@ -78,11 +73,11 @@ def main(config, case_config): latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.sub, (a, b), host_device_sync, config, case_config) # 调整为torch.sub - op2flops = lambda x: x * 2 * m * 1024 * 1024 # 根据减法的实际FLOPs调整 + op2flops = lambda x: x * 2 * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "sub", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -90,6 +85,7 @@ def main(config, 
case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/sub/metax/C550_64/case_config.yaml b/operation/benchmarks/sub/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..fde4f8949 --- /dev/null +++ b/operation/benchmarks/sub/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 10000 diff --git a/operation/benchmarks/sub/metax/C550_64/env.sh b/operation/benchmarks/sub/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/sub/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/sub/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/sub/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..6a3d9556d --- /dev/null +++ b/operation/benchmarks/sub/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.46TFLOPS | 0.46TFLOPS | 0.15% | 0.15% | +| nativetorch | True | 0.45TFLOPS | 0.45TFLOPS | 0.15% | 0.14% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4671.76us | 4678.66us | 214.05op/s | 213.74op/s | 3437099.37us | 4779.86us | +| nativetorch | 4745.56us | 4749.31us | 210.72op/s | 210.56op/s | 742265.72us | 4741.16us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 250.0W | 253.0W | 2.57W | 400W | +| flaggems监控结果 | 1599.0W | 1638.0W | 39.0W | / | 291.5W | 297.0W | 4.27W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.122% | 2.457% | 41.92°C | 21.425% | +| flaggems监控结果 | 18.559% | 2.485% | 48.09°C | 89.607% | diff --git a/operation/benchmarks/sub/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/sub/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..8be347251 --- /dev/null +++ b/operation/benchmarks/sub/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.46TFLOPS | 0.46TFLOPS | 0.15% | 0.15% | +| nativetorch | True | 0.45TFLOPS | 0.45TFLOPS | 0.15% | 0.14% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 4666.51us | 4674.56us | 214.29op/s | 213.92op/s | 1590097.15us | 4774.58us | +| nativetorch | 4729.07us | 4748.29us | 211.46op/s | 210.6op/s | 23312.03us | 4740.75us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1521.0W | 1638.0W | 117.0W | / | 260.7W | 266.0W | 4.22W | 400W | +| flaggems监控结果 | 1482.0W | 1482.0W | 0.0W | / | 288.6W | 295.0W | 5.04W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.037% | 2.727% | 44.29°C | 21.425% | +| flaggems监控结果 | 1.118% | 2.661% | 47.9°C | 21.238% | diff --git a/operation/benchmarks/sub/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/sub/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..ceda28046 --- /dev/null +++ b/operation/benchmarks/sub/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.23TFLOPS | 0.23TFLOPS | 1.18% | 1.18% | +| nativetorch | True | 0.23TFLOPS | 0.23TFLOPS | 1.16% | 1.16% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 9331.27us | 9348.1us | 107.17op/s | 106.97op/s | 912822.65us | 9423.48us | +| nativetorch | 9477.43us | 9479.17us | 105.51op/s | 105.49op/s | 31180.51us | 9514.0us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 252.32W | 257.0W | 2.83W | 400W | +| flaggems监控结果 | 1521.0W | 1638.0W | 117.0W | / | 271.53W | 276.0W | 3.84W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 10.908% | 2.466% | 45.34°C | 89.607% | +| flaggems监控结果 | 1.26% | 2.372% | 47.0°C | 41.447% | diff --git a/operation/benchmarks/sub/nvidia/A100_40_SXM/README.md b/operation/benchmarks/sub/nvidia/A100_40_SXM/README.md deleted file mode 100644 index c9b912162..000000000 --- a/operation/benchmarks/sub/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 
内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 1.62E-08 | 3.68TFLOPS | 3.68TFLOPS | 1.12% | 1.12% | -| nativetorch | 1.62E-08 | 3.68TFLOPS | 3.68TFLOPS | 1.12% | 1.12% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 6.09E-10 | 9431.9us | 9470.98us | 106.02op/s | 105.59op/s | 1047320.43us | 9568.19us | -| nativetorch | 6.09E-10 | 9472.43us | 9478.14us | 105.57op/s | 105.51op/s | 13811.08us | 9531.92us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1638.0W | 1638.0W | 0.0W | / | 258.57W | 261.0W | 1.9W | 1638.0 | -| flaggems监控结果 | 1703.68W | 1716.0W | 28.44W | / | 284.31W | 287.0W | 1.74W | 1703.68 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.618% | 1.293% | 50.47°C | 41.64% | -| flaggems监控结果 | 0.768% | 1.293% | 50.35°C | 41.452% | diff --git a/operation/benchmarks/sub/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/sub/nvidia/A100_40_SXM/case_config.yaml index 7ad6570ee..fde4f8949 100644 --- a/operation/benchmarks/sub/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/sub/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ -ITERS: 250000 -SPECTFLOPS: 19.5 +ITERS: 10000 diff --git 
a/operation/benchmarks/sum/cambricon/MLU/case_config.yaml b/operation/benchmarks/sum/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/sum/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/sum/cambricon/MLU/env.sh b/operation/benchmarks/sum/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/sum/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/sum/case_config.yaml b/operation/benchmarks/sum/case_config.yaml index 1e14182d2..4d04f5499 100644 --- a/operation/benchmarks/sum/case_config.yaml +++ b/operation/benchmarks/sum/case_config.yaml @@ -1,5 +1,4 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 ITERS: 50000 KERNELWARMUP: 10 diff --git a/operation/benchmarks/sum/iluvatar/BI150/README.md b/operation/benchmarks/sum/iluvatar/BI150/README.md new file mode 100644 index 000000000..01e2f295f --- /dev/null +++ b/operation/benchmarks/sum/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + +* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 2.92E-07 | 0.32TFLOPS | 0.32TFLOPS | 1.32% | 1.3% | +| nativetorch | 2.96E-07 | 0.18TFLOPS | 0.18TFLOPS | 0.73% | 0.73% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | 
------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 4.34E-07 | 3320.52us | 3357.13us | 301.16op/s | 297.87op/s | 776568.33us | 3613.17us | +| nativetorch | 5.17E-07 | 5966.37us | 6002.02us | 167.61op/s | 166.61op/s | 6177.92us | 6224.33us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2071.0W | 2090.0W | 26.87W | / | 140.69W | 141.0W | 2.32W | 350W | +| flaggems监控结果 | 2071.0W | 2109.0W | 38.0W | / | 171.48W | 172.0W | 0.5W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 39.298% | 2.569% | 44.95°C | 6.989% | +| flaggems监控结果 | 44.153% | 2.579% | 50.27°C | 6.989% | \ No newline at end of file diff --git a/operation/benchmarks/sum/iluvatar/BI150/case_config.yaml b/operation/benchmarks/sum/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/sum/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ +Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/sum/iluvatar/BI150/env.sh b/operation/benchmarks/sum/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/sum/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/sum/iluvatar/BI150/requirements.txt b/operation/benchmarks/sum/iluvatar/BI150/requirements.txt new file mode 
100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/sum/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/sum/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/sum/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..398fbc7c8 --- /dev/null +++ b/operation/benchmarks/sum/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Shape: [4096, 256] +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/sum/kunlunxin/R300p/env.sh b/operation/benchmarks/sum/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/sum/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/sum/main.py b/operation/benchmarks/sum/main.py index e6bc234b0..264954135 100644 --- a/operation/benchmarks/sum/main.py +++ b/operation/benchmarks/sum/main.py @@ -4,12 +4,13 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from argparse import ArgumentParser, Namespace import yaml import sys +import subprocess +import math sys.path.append("..") from drivers.utils import * @@ -23,7 +24,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") - + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, required=True, @@ -45,41 +53,38 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + 
} set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase Melements = case_config.Melements + # default shape: (M, 1024, 1024) + shape = (Melements, 1024, 1024) - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(Melements, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.sum(a) - - a = a.to(0) - - r_device = torch.sum(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - mmape.append(mape) - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) + if config.vendor == 'kunlunxin': + # if `Shape' specified in `case_config.yaml', use it + if case_config.__contains__('Shape') and case_config.Shape is not None: + shape = case_config.Shape - a = torch.randn(Melements * 1024 * 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0) + print(f'Shape for performance_test: {a.shape}') latency_nowarm, latency_warm, cputime, kerneltime = do_test( torch.sum, (a, ), host_device_sync, config, case_config) - op2flops = lambda x: x * 2 * Melements * 1024 * 1024 + op2flops = lambda x: x * math.prod(shape) perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "sum", *perf_result, mape, mape_std, + config.spectflops) + print_result(config, config.case_name, *perf_result, correctness, latency_nowarm, latency_warm) @@ -87,6 +92,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) diff --git a/operation/benchmarks/linear/nvidia/A100_40_SXM/README.md b/operation/benchmarks/sum/metax/C550_64/README.md similarity index 57% rename 
from operation/benchmarks/linear/nvidia/A100_40_SXM/README.md rename to operation/benchmarks/sum/metax/C550_64/README.md index 68f30dc26..740b1e3c6 100644 --- a/operation/benchmarks/linear/nvidia/A100_40_SXM/README.md +++ b/operation/benchmarks/sum/metax/C550_64/README.md @@ -1,11 +1,11 @@ # 参评AI芯片信息 -* 厂商:Nvidia +* 厂商:Metax -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W +* 产品名称:C550 +* 产品型号:曦云®C550 64G +* TDP:350W # 所用服务器配置 @@ -13,12 +13,12 @@ * 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* docker版本:20.10.16 -* 内存:1TiB +* 服务器型号:Nettrix X640 G40 +* 操作系统版本:Ubuntu 20.04.1 LTS +* 操作系统内核:linux5.4.0-42-generic +* CPU:Intel(R) Xeon(R) Gold 6348-112core +* docker版本:27.0.3 +* 内存:2.0TiB * 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 # 算子库版本 @@ -31,8 +31,8 @@ https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a3 | 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | | ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 1.792E-4 | 254.32TFLOPS | 255.77TFLOPS | 81.51% | 81.98% | -| nativetorch | 1.792E-4 | 253.99TFLOPS | 255.90TFLOPS | 81.41% | 82.02% | +| flaggems | 5.949E-7 | | | 4.3% | 4.28% | +| nativetorch | 6.540E-7 | | | 3.66% | 3.66% | 说明:kerneltime采用triton.testing.do\_bench接口给出,准确度低于nsys等profiling工具 @@ -40,19 +40,19 @@ https://github.com/FlagOpen/FlagGems. 
Commit ID:982781081f5d62856064ae986e8927a3 | 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | | ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 5.275E-6 | 4323.39us | 4298.75us | 231.30 op/s | 232.63 op/s | 2052473.44 us | 4263.51 us | -| nativetorch | 5.275E-6 | 4328.95us | 4296.7us | 231.00 op/s | 232.74 op/s | 5814.00 us | 4458.83 us | +| flaggems | 2.694E-6 | | | | | | | +| nativetorch | 3.505E-7 | | | | | | | ## 能耗监控结果 | 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | | ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| flaggems监控结果 | 1872.0W | 1872.0W | 0.0W | / | 397.52W | 413.0W | 8.25W | 400W | -| nativetorch监控结果 | 1872.0W | 1872.0W | 0.0W | / | 398.02W | 411.0W | 3.3W | 400W | +| flaggems监控结果 | 200.54W | 217.2W | 24.39W | / | 72.0W | 72.0W | 0.0W | 350W | +| nativetorch监控结果 | 208.38W | 216.0W | 7.87W | / | 72.0W | 72.0W | 0.0W | 350W | ## 其他重要监控结果 | 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | | ---- | --------- | -------- | ------------ | -------------- | -| flaggems监控结果 | 0.837% | 1.373% | 64.30°C | 3.920% | -| nativetorch监控结果 | 1.211% | 1.45% | 64.41°C | 3.427% | +| flaggems监控结果 | 1.34% | 1.994% | 44.0°C | 18.599% | +| nativetorch监控结果 | 1.46% | 1.985% | 43.0°C | 18.599% | diff --git a/operation/benchmarks/sum/metax/C550_64/case_config.yaml b/operation/benchmarks/sum/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/sum/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/sum/metax/C550_64/env.sh b/operation/benchmarks/sum/metax/C550_64/env.sh new file mode 100644 index 000000000..0cdec082d --- /dev/null +++ b/operation/benchmarks/sum/metax/C550_64/env.sh @@ -0,0 +1 @@ +echo "METAX PLACEHOLDER ENV.SH" diff --git 
a/operation/benchmarks/sum/metax/C550_64/requirements.txt b/operation/benchmarks/sum/metax/C550_64/requirements.txt new file mode 100644 index 000000000..7248303e5 --- /dev/null +++ b/operation/benchmarks/sum/metax/C550_64/requirements.txt @@ -0,0 +1 @@ +loguru \ No newline at end of file diff --git a/operation/benchmarks/sum/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/sum/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..dd21b9574 --- /dev/null +++ b/operation/benchmarks/sum/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.24% | +| nativetorch | True | 0.69TFLOPS | 0.68TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.0us | 1464.32us | 693.0op/s | 682.91op/s | 1634242.67us | 1503.25us | +| nativetorch | 1548.04us | 1568.77us | 645.98op/s | 637.44op/s | 26875.04us | 1564.32us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1560.0W | 1638.0W | 78.0W | / | 269.0W | 274.0W | 3.44W | 400W | +| flaggems监控结果 | 1521.0W | 
1638.0W | 117.0W | / | 281.0W | 286.0W | 3.25W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.704% | 2.466% | 45.56°C | 6.273% | +| flaggems监控结果 | 0.925% | 2.453% | 46.41°C | 6.086% | diff --git a/operation/benchmarks/sum/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/sum/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..6da58b02b --- /dev/null +++ b/operation/benchmarks/sum/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.74TFLOPS | 0.73TFLOPS | 0.24% | 0.23% | +| nativetorch | True | 0.69TFLOPS | 0.68TFLOPS | 0.22% | 0.22% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 1443.06us | 1465.34us | 692.97op/s | 682.43op/s | 1409158.96us | 1509.98us | +| nativetorch | 1546.68us | 1567.74us | 646.54op/s | 637.86op/s | 20192.56us | 1595.54us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1599.0W | 1716.0W | 117.0W | / | 280.13W | 284.0W | 3.38W | 400W | +| 
flaggems监控结果 | 1560.0W | 1716.0W | 156.0W | / | 287.07W | 290.0W | 3.15W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 2.197% | 2.451% | 47.08°C | 6.273% | +| flaggems监控结果 | 1.11% | 3.802% | 45.44°C | 6.086% | diff --git a/operation/benchmarks/sum/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/sum/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..7c2b5795a --- /dev/null +++ b/operation/benchmarks/sum/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 801377f03ba4649bc2d839ff34e38be66ee8a633 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.37TFLOPS | 0.37TFLOPS | 1.92% | 1.9% | +| nativetorch | True | 0.36TFLOPS | 0.36TFLOPS | 1.87% | 1.85% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 2874.56us | 2896.9us | 347.88op/s | 345.2op/s | 1238680.68us | 2937.97us | +| nativetorch | 2952.23us | 2973.7us | 338.73op/s | 336.28op/s | 28175.53us | 2983.18us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1521.0W | 1638.0W | 117.0W | / | 265.3W | 267.0W | 2.38W 
| 400W | +| flaggems监控结果 | 1560.0W | 1638.0W | 78.0W | / | 273.79W | 276.0W | 2.54W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.787% | 2.399% | 46.41°C | 11.326% | +| flaggems监控结果 | 0.724% | 2.394% | 47.08°C | 11.933% | diff --git a/operation/benchmarks/sum/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/sum/nvidia/A100_40_SXM/case_config.yaml index 7d02883ab..bc4b04b42 100644 --- a/operation/benchmarks/sum/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/sum/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 diff --git a/operation/benchmarks/tanh/cambricon/MLU/case_config.yaml b/operation/benchmarks/tanh/cambricon/MLU/case_config.yaml new file mode 100644 index 000000000..6d9cc52ac --- /dev/null +++ b/operation/benchmarks/tanh/cambricon/MLU/case_config.yaml @@ -0,0 +1 @@ +SPECTFLOPS: 999999 diff --git a/operation/benchmarks/tanh/cambricon/MLU/env.sh b/operation/benchmarks/tanh/cambricon/MLU/env.sh new file mode 100644 index 000000000..f7da59c6e --- /dev/null +++ b/operation/benchmarks/tanh/cambricon/MLU/env.sh @@ -0,0 +1 @@ +echo "CAMBRICON PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/tanh/case_config.yaml b/operation/benchmarks/tanh/case_config.yaml index ff98d295e..4d04f5499 100644 --- a/operation/benchmarks/tanh/case_config.yaml +++ b/operation/benchmarks/tanh/case_config.yaml @@ -1,6 +1,5 @@ Melements: 1024 -SPECTFLOPS: 10000 WARMUP: 100 -ITERS: 100000 +ITERS: 50000 KERNELWARMUP: 10 -KERNELITERS: 1000 \ No newline at end of file +KERNELITERS: 1000 diff --git a/operation/benchmarks/tanh/iluvatar/BI150/README.md b/operation/benchmarks/tanh/iluvatar/BI150/README.md new file mode 100644 index 000000000..53ef0ac4b --- /dev/null +++ b/operation/benchmarks/tanh/iluvatar/BI150/README.md @@ -0,0 +1,54 @@ +# 参评AI芯片信息 + +* 厂商:ILUVATAR + +* 产品名称:BI150 +* 产品型号:BI150 +* TDP:W + +# 所用服务器配置 + 
+* 服务器数量:1 + + +* 单服务器内使用卡数:1 +* 服务器型号: +* 操作系统版本:Ubuntu 20.04.6 LTS +* 操作系统内核:linux5.4.0-148-generic +* CPU: +* docker版本:20.10.25 +* 内存: +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 +FlagGems:>联系邮箱: contact-us@iluvatar.com获取版本(FlagGems-0710_pointwise_use_tid) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | 2.78E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.29% | 0.29% | +| nativetorch | 2.78E-08 | 0.07TFLOPS | 0.07TFLOPS | 0.3% | 0.3% | + +## 其他评测结果 + +| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | +| flaggems | 9.98E-10 | 7412.57us | 7423.31us | 134.91op/s | 134.71op/s | 355959.99us | 8004.26us | +| nativetorch | 9.98E-10 | 7354.15us | 7394.27us | 135.98op/s | 135.24op/s | 7635.33us | 7614.73us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 2094.75W | 2128.0W | 47.26W | / | 182.53W | 183.0W | 3.84W | 350W | +| flaggems监控结果 | 2085.25W | 2109.0W | 41.14W | / | 188.82W | 189.0W | 0.45W | 350W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 45.425% | 2.388% | 51.75°C | 19.489% | +| flaggems监控结果 | 43.139% | 2.391% | 53.07°C | 19.489% | \ No newline at end of file diff --git a/operation/benchmarks/tanh/iluvatar/BI150/case_config.yaml b/operation/benchmarks/tanh/iluvatar/BI150/case_config.yaml new file mode 100644 index 000000000..4958ee3ba --- /dev/null +++ b/operation/benchmarks/tanh/iluvatar/BI150/case_config.yaml @@ -0,0 +1,6 @@ 
+Melements: 512 +SPECTFLOPS: 24.576 +WARMUP: 100 +ITERS: 50000 +KERNELWARMUP: 10 +KERNELITERS: 1000 \ No newline at end of file diff --git a/operation/benchmarks/tanh/iluvatar/BI150/env.sh b/operation/benchmarks/tanh/iluvatar/BI150/env.sh new file mode 100644 index 000000000..f8afe15fd --- /dev/null +++ b/operation/benchmarks/tanh/iluvatar/BI150/env.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages:$PYTHONPATH +export LD_LIBRARY_PATH=/usr/local/corex/lib64:$LD_LIBRARY_PATH +export PATH=/usr/local/corex/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/corex/lib64/python3/dist-packages/bin:$PATH diff --git a/operation/benchmarks/tanh/iluvatar/BI150/requirements.txt b/operation/benchmarks/tanh/iluvatar/BI150/requirements.txt new file mode 100644 index 000000000..173a80076 --- /dev/null +++ b/operation/benchmarks/tanh/iluvatar/BI150/requirements.txt @@ -0,0 +1 @@ +#loguru diff --git a/operation/benchmarks/tanh/kunlunxin/R300p/case_config.yaml b/operation/benchmarks/tanh/kunlunxin/R300p/case_config.yaml new file mode 100644 index 000000000..bfd12215d --- /dev/null +++ b/operation/benchmarks/tanh/kunlunxin/R300p/case_config.yaml @@ -0,0 +1,3 @@ +Melements: 1 +ITERS: 50 +SPECTFLOPS: 9999 diff --git a/operation/benchmarks/tanh/kunlunxin/R300p/env.sh b/operation/benchmarks/tanh/kunlunxin/R300p/env.sh new file mode 100644 index 000000000..6e1e159ef --- /dev/null +++ b/operation/benchmarks/tanh/kunlunxin/R300p/env.sh @@ -0,0 +1,5 @@ +echo "KUNLUNXIN ENV.SH start" + +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda + +echo "KUNLUNXIN ENV.SH end" diff --git a/operation/benchmarks/tanh/main.py b/operation/benchmarks/tanh/main.py index b536fb1cc..6ddf6c4f4 100644 --- a/operation/benchmarks/tanh/main.py +++ b/operation/benchmarks/tanh/main.py @@ -4,12 +4,12 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import torch -import torch.distributed as dist import os import time from 
argparse import ArgumentParser, Namespace import yaml import sys +import subprocess sys.path.append("..") from drivers.utils import * @@ -23,6 +23,14 @@ def parse_args(): type=str, required=True, help="vendor name like nvidia") + parser.add_argument("--case_name", + type=str, + required=True, + help="op name like mm") + parser.add_argument("--spectflops", + type=str, + required=True, + help="spectflops of current dataformat") parser.add_argument("--dataformat", type=str, @@ -45,43 +53,32 @@ def parse_args(): def main(config, case_config): + correctness = do_correctness(config.case_name) + correctness = correctness == 0 + dtype = { + "FP32": torch.float32, + "FP16": torch.float16, + "BF16": torch.bfloat16, + "INT32": torch.int32, + "INT16": torch.int16, + "BOOL": torch.bool + } set_ieee_float32(config.vendor) - print("Test Correctness with 1M-times smaller operation" - ) # correctness is implemented casebycase m = case_config.Melements - dtype = {"FP32": torch.float32} - - mmape = [] - - torch.manual_seed(42) - for i in range(100): - a = torch.randn(m, dtype=dtype[config.dataformat]) - - a_fp64 = a.to(torch.float64) - r_fp64 = torch.tanh(a_fp64) - - a = a.to(0) - r_device = torch.tanh(a).cpu() - mape = torch.mean(torch.abs(r_device - r_fp64) / torch.abs(r_fp64)) - - mmape.append(mape) - - mape = torch.mean(torch.tensor(mmape)) - mape_std = torch.std(torch.tensor(mmape)) - a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat]).to(0) + a = torch.randn(m, 1024, 1024, dtype=dtype[config.dataformat], requires_grad=True).to(0) latency_nowarm, latency_warm, cputime, kerneltime = do_test( - torch.tanh, (a, ), host_device_sync, config, case_config) + torch.tanh, (a, ), host_device_sync, config, case_config, bp=True) op2flops = lambda x: x * m * 1024 * 1024 perf_result = cal_perf(cputime, kerneltime, op2flops, - case_config.SPECTFLOPS) - print_result(config, "tanh", *perf_result, mape, mape_std, + config.spectflops, bp=True) + print_result(config, config.case_name, 
*perf_result, correctness, latency_nowarm, latency_warm) @@ -89,6 +86,7 @@ def main(config, case_config): config = parse_args() with open("case_config.yaml", "r") as file: case_config = yaml.safe_load(file) + adapt_torch(config.vendor) with open(os.path.join(config.vendor, config.chip, "case_config.yaml"), "r") as file: case_config_vendor = yaml.safe_load(file) @@ -101,4 +99,4 @@ def main(config, case_config): print("Using flaggems") else: print("Using nativetorch") - main(config, case_config) \ No newline at end of file + main(config, case_config) diff --git a/operation/benchmarks/tanh/metax/C550_64/case_config.yaml b/operation/benchmarks/tanh/metax/C550_64/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/tanh/metax/C550_64/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/tanh/metax/C550_64/env.sh b/operation/benchmarks/tanh/metax/C550_64/env.sh new file mode 100644 index 000000000..bb4cd2dce --- /dev/null +++ b/operation/benchmarks/tanh/metax/C550_64/env.sh @@ -0,0 +1,2 @@ +echo "METAX PLACEHOLDER ENV.SH" +export TRITON_CONST_ARGS_OPT=1 \ No newline at end of file diff --git a/operation/benchmarks/tanh/nvidia/A100_40_SXM/BF16_README.md b/operation/benchmarks/tanh/nvidia/A100_40_SXM/BF16_README.md new file mode 100644 index 000000000..a50585f41 --- /dev/null +++ b/operation/benchmarks/tanh/nvidia/A100_40_SXM/BF16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. 
Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.11TFLOPS | 0.11TFLOPS | 0.03% | 0.03% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 30568.42us | 30579.71us | 32.71op/s | 32.7op/s | 1633862.32us | 17591.63us | +| nativetorch | 9142.71us | 9156.61us | 109.38op/s | 109.21op/s | 9868.85us | 3183.64us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1653.6W | 1716.0W | 124.8W | / | 310.05W | 316.0W | 5.17W | 400W | +| flaggems监控结果 | 1688.14W | 1716.0W | 81.31W | / | 288.8W | 300.0W | 6.9W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.136% | 1.388% | 52.62°C | 21.43% | +| flaggems监控结果 | 0.828% | 1.464% | 51.15°C | 51.744% | diff --git a/operation/benchmarks/tanh/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/tanh/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..ff8cda03b --- /dev/null +++ b/operation/benchmarks/tanh/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.11TFLOPS | 0.11TFLOPS | 0.03% | 0.03% | +| nativetorch | True | 0.35TFLOPS | 0.35TFLOPS | 0.11% | 0.11% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 30332.6us | 30341.12us | 32.97op/s | 32.96op/s | 1049242.69us | 17319.36us | +| nativetorch | 9089.02us | 9103.36us | 110.02op/s | 109.85op/s | 9753.31us | 3178.95us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1669.2W | 1716.0W | 93.6W | / | 322.48W | 329.0W | 5.61W | 400W | +| flaggems监控结果 | 1693.71W | 1716.0W | 62.04W | / | 293.33W | 306.0W | 7.55W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.504% | 1.38% | 53.66°C | 26.483% | +| flaggems监控结果 | 1.419% | 1.464% | 51.32°C | 51.744% | diff --git a/operation/benchmarks/tanh/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/tanh/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..f3beeb33d --- /dev/null +++ b/operation/benchmarks/tanh/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 3c10679326b32ea5f037db50cc397d41c0ff1934 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 0.21TFLOPS | 0.21TFLOPS | 1.07% | 1.07% | +| nativetorch | True | 0.21TFLOPS | 0.21TFLOPS | 1.07% | 1.07% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 15413.82us | 15420.42us | 64.88op/s | 64.85op/s | 879982.58us | 6784.51us | +| nativetorch | 15429.3us | 15438.85us | 64.81op/s | 64.77op/s | 14390.52us | 6251.96us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1671.43W | 1716.0W | 109.18W | / | 287.06W | 295.0W | 3.88W | 400W | +| flaggems监控结果 | 1682.57W | 1716.0W | 81.88W | / | 302.75W | 311.0W | 4.56W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 1.567% | 1.639% | 52.24°C | 41.64% | +| flaggems监控结果 | 1.525% | 1.645% | 52.04°C | 41.457% | diff --git a/operation/benchmarks/tanh/nvidia/A100_40_SXM/README.md b/operation/benchmarks/tanh/nvidia/A100_40_SXM/README.md deleted file mode 100644 index 6ea053042..000000000 --- a/operation/benchmarks/tanh/nvidia/A100_40_SXM/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# 参评AI芯片信息 - -* 厂商:Nvidia - - -* 产品名称:A100 -* 产品型号:A100-40GiB-SXM -* TDP:400W - -# 所用服务器配置 - -* 服务器数量:1 - - -* 单服务器内使用卡数:1 -* 服务器型号:DGX A100 -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:linux5.4.0-113 -* CPU:AMD EPYC7742-64core -* 
docker版本:20.10.16 -* 内存:1TiB -* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 - -# 算子库版本 - -https://github.com/FlagOpen/FlagGems. Commit ID:982781081f5d62856064ae986e8927a31e96c235 - -# 评测结果 - -## 核心评测结果 - -| 评测项 | 平均相对误差(with FP64-CPU) | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | -| ---- | -------------- | -------------- | ------------ | ------ | ----- | -| flaggems | 3.76E-08 | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | -| nativetorch | 3.76E-08 | 0.17TFLOPS | 0.17TFLOPS | 0.89% | 0.89% | - -## 其他评测结果 - -| 评测项 | 相对误差(with FP64-CPU)标准差 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时>延 | -| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | ------------ | -| flaggems | 9.52E-10 | 6169.25us | 6171.65us | 162.09op/s | 162.03op/s | 301271.92us | 6246.96us | -| nativetorch | 9.52E-10 | 6188.82us | 6194.18us | 161.58op/s | 161.44op/s | 12297.66us | 6213.04us | - -## 能耗监控结果 - -| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单 -卡TDP | -| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | -| nativetorch监控结果 | 1716.0W | 1716.0W | 0.0W | / | 303.74W | 308.0W | 4.99W | 1716.0 | -| flaggems监控结果 | 1794.0W | 1794.0W | 0.0W | / | 344.47W | 348.0W | 3.68W | 1794.0 | - -## 其他重要监控结果 - -| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | -| ---- | --------- | -------- | ------------ | -------------- | -| nativetorch监控结果 | 0.756% | 1.395% | 51.51°C | 31.535% | -| flaggems监控结果 | 0.811% | 1.397% | 53.03°C | 31.352% | diff --git a/operation/benchmarks/tanh/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/tanh/nvidia/A100_40_SXM/case_config.yaml index c7975e944..bc4b04b42 100644 --- a/operation/benchmarks/tanh/nvidia/A100_40_SXM/case_config.yaml +++ b/operation/benchmarks/tanh/nvidia/A100_40_SXM/case_config.yaml @@ -1,2 +1 @@ ITERS: 50000 -SPECTFLOPS: 19.5 \ No newline at end of file diff --git 
# Copyright (c) 2024 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""FlagPerf operation benchmark entry point for torch.triu.

Run inside the vendor container by container_main.py; measures correctness
(against a CPU golden reference) and throughput of torch.triu.
"""
import torch
import os
import time
from argparse import ArgumentParser, Namespace
import yaml
import sys
import subprocess

sys.path.append("..")
from drivers.utils import *
from drivers.calculate import *


def parse_args():
    """Parse the CLI arguments that container_main.py forwards to every case.

    Returns an argparse.Namespace; unrecognised flags are preserved in
    ``args.unknown_args`` instead of raising, so vendor wrappers can pass
    extra options through.
    """
    parser = ArgumentParser(description=" ")

    parser.add_argument("--vendor",
                        type=str,
                        required=True,
                        help="vendor name like nvidia")
    parser.add_argument("--case_name",
                        type=str,
                        required=True,
                        help="op name like mm")
    parser.add_argument("--spectflops",
                        type=str,
                        required=True,
                        help="spectflops of current dataformat")
    parser.add_argument("--dataformat",
                        type=str,
                        required=True,
                        help="like FP32,FP16")
    parser.add_argument("--oplib",
                        type=str,
                        required=True,
                        help="impl like pytorch/flaggems/cpp")
    parser.add_argument("--chip",
                        type=str,
                        required=True,
                        help="chip like A100_40_SXM")

    args, unknown_args = parser.parse_known_args()
    args.unknown_args = unknown_args
    return args


def main(config, case_config):
    """Run the triu correctness and performance test and print the results.

    config: parsed CLI namespace (vendor, dataformat, spectflops, ...).
    case_config: merged base + vendor case_config.yaml as a Namespace
                 (must provide M, N and the WARMUP/ITERS knobs used by
                 drivers.calculate.do_test).
    """
    # do_correctness returns 0 on success (exit-code convention).
    correctness = do_correctness(config.case_name)
    correctness = correctness == 0
    dtype = {
        "FP32": torch.float32,
        "FP16": torch.float16,
        "BF16": torch.bfloat16,
        "INT32": torch.int32,
        "INT16": torch.int16,
        "BOOL": torch.bool
    }
    set_ieee_float32(config.vendor)

    M = case_config.M
    N = case_config.N
    # Default performance-test shape scales the configured tile by 50x.
    shape = (M * 50, N * 50)

    if config.vendor == 'kunlunxin':
        # Honour an explicit `Shape' override from the vendor case_config.yaml.
        if 'Shape' in case_config and case_config.Shape is not None:
            shape = case_config.Shape

    a = torch.randn(shape, dtype=dtype[config.dataformat]).to(0)
    print(f'Shape for performance_test: {a.shape}')

    latency_nowarm, latency_warm, cputime, kerneltime = do_test(
        torch.triu, (a, ), host_device_sync, config, case_config)

    # Work per call: triu zeroes the strictly-lower triangle, i.e. about
    # shape[0] * (shape[1] - 1) / 2 elements, so total ops scale linearly
    # with the iteration count x.  The previous expression,
    # (x * shape[0]) * (x * shape[1] - 1) / 2, multiplied x into both
    # factors and made the FLOP count quadratic in x.
    # TODO(review): confirm the x convention against drivers.calculate.cal_perf.
    op2flops = lambda x: x * shape[0] * (shape[1] - 1) / 2

    perf_result = cal_perf(cputime, kerneltime, op2flops,
                           config.spectflops)
    print_result(config, config.case_name, *perf_result, correctness,
                 latency_nowarm, latency_warm)


if __name__ == "__main__":
    config = parse_args()
    # Base config first, then vendor/chip overrides win on key collisions.
    with open("case_config.yaml", "r") as file:
        case_config = yaml.safe_load(file)
    adapt_torch(config.vendor)
    with open(os.path.join(config.vendor, config.chip, "case_config.yaml"),
              "r") as file:
        case_config_vendor = yaml.safe_load(file)
    case_config.update(case_config_vendor)
    case_config = Namespace(**case_config)

    if config.oplib == "flaggems":
        import flag_gems
        flag_gems.enable()
        print("Using flaggems")
    else:
        print("Using nativetorch")
    main(config, case_config)
Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 22.12TFLOPS | 22.1TFLOPS | 7.09% | 7.08% | +| nativetorch | True | 19.58TFLOPS | 19.55TFLOPS | 6.28% | 6.26% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 7697.01us | 7700.48us | 129.92op/s | 129.86op/s | 3971079.6us | 7770.98us | +| nativetorch | 8182.05us | 8188.93us | 122.22op/s | 122.12op/s | 26088.56us | 8204.55us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1731.6W | 1794.0W | 124.8W | / | 376.76W | 383.0W | 6.57W | 400W | +| flaggems监控结果 | 1599.0W | 1638.0W | 67.55W | / | 265.81W | 272.0W | 3.74W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.615% | 1.06% | 57.15°C | 38.038% | +| flaggems监控结果 | 0.624% | 1.061% | 50.24°C | 38.827% | diff --git a/operation/benchmarks/triu/nvidia/A100_40_SXM/FP16_README.md b/operation/benchmarks/triu/nvidia/A100_40_SXM/FP16_README.md new file mode 100644 index 000000000..a335dd7ae --- /dev/null +++ b/operation/benchmarks/triu/nvidia/A100_40_SXM/FP16_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + 
+https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 22.15TFLOPS | 22.13TFLOPS | 7.1% | 7.09% | +| nativetorch | True | 19.58TFLOPS | 19.55TFLOPS | 6.28% | 6.26% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 7692.04us | 7695.36us | 130.0op/s | 129.95op/s | 3833847.05us | 7858.99us | +| nativetorch | 8182.08us | 8188.93us | 122.22op/s | 122.12op/s | 27354.22us | 8201.75us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1700.4W | 1794.0W | 151.25W | / | 382.7W | 390.0W | 7.61W | 400W | +| flaggems监控结果 | 1606.8W | 1638.0W | 62.4W | / | 268.49W | 273.0W | 4.53W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.626% | 1.072% | 60.06°C | 50.555% | +| flaggems监控结果 | 0.631% | 1.071% | 50.51°C | 38.827% | diff --git a/operation/benchmarks/triu/nvidia/A100_40_SXM/FP32_README.md b/operation/benchmarks/triu/nvidia/A100_40_SXM/FP32_README.md new file mode 100644 index 000000000..2492772d4 --- /dev/null +++ b/operation/benchmarks/triu/nvidia/A100_40_SXM/FP32_README.md @@ -0,0 +1,53 @@ +# 参评AI芯片信息 + +* 厂商:Nvidia + +* 产品名称:A100 +* 产品型号:A100-40GiB-SXM +* TDP:400W + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数: 1 +* 服务器型号:DGX A100 +* 操作系统版本:Ubuntu 20.04.4 LTS +* 操作系统内核:linux5.4.0-113 +* CPU:AMD EPYC7742-64core +* docker版本:20.10.16 +* 内存:1TiB +* 
服务器间AI芯片直连规格及带宽:此评测项不涉及服务期间AI芯片直连 + +# 算子库版本 + +https://github.com/FlagOpen/FlagGems. Commit ID: 7042de1d8fb6f978596322faaeda6b55ca1ae5ec + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | True | 5.54TFLOPS | 5.54TFLOPS | 28.4% | 28.4% | +| nativetorch | True | 7.78TFLOPS | 7.79TFLOPS | 39.92% | 39.96% | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | 15385.57us | 15384.58us | 65.0op/s | 65.0op/s | 4664630.35us | 15567.45us | +| nativetorch | 12976.67us | 12969.98us | 77.06op/s | 77.1op/s | 32750.17us | 13024.22us | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | 1755.0W | 1872.0W | 125.37W | / | 389.41W | 396.0W | 5.18W | 400W | +| flaggems监控结果 | 1604.57W | 1638.0W | 81.88W | / | 253.09W | 259.0W | 6.71W | 400W | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- | --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | 0.609% | 1.014% | 61.34°C | 99.894% | +| flaggems监控结果 | 0.642% | 1.011% | 49.59°C | 75.831% | diff --git a/operation/benchmarks/triu/nvidia/A100_40_SXM/case_config.yaml b/operation/benchmarks/triu/nvidia/A100_40_SXM/case_config.yaml new file mode 100644 index 000000000..bc4b04b42 --- /dev/null +++ b/operation/benchmarks/triu/nvidia/A100_40_SXM/case_config.yaml @@ -0,0 +1 @@ +ITERS: 50000 diff --git a/operation/benchmarks/triu/nvidia/A100_40_SXM/env.sh b/operation/benchmarks/triu/nvidia/A100_40_SXM/env.sh new file mode 100644 index 000000000..33786ec0d --- /dev/null +++ 
b/operation/benchmarks/triu/nvidia/A100_40_SXM/env.sh @@ -0,0 +1 @@ +echo "NVIDIA PLACEHOLDER ENV.SH" diff --git a/operation/benchmarks/triu/nvidia/A100_40_SXM/requirements.txt b/operation/benchmarks/triu/nvidia/A100_40_SXM/requirements.txt new file mode 100644 index 000000000..330e27963 --- /dev/null +++ b/operation/benchmarks/triu/nvidia/A100_40_SXM/requirements.txt @@ -0,0 +1 @@ +loguru diff --git a/operation/configs/host.yaml b/operation/configs/host.yaml index 279479f44..4895e4d37 100644 --- a/operation/configs/host.yaml +++ b/operation/configs/host.yaml @@ -1,6 +1,10 @@ FLAGPERF_PATH: "/home/FlagPerf/operation" FLAGPERF_LOG_PATH: "result" +##nvidia,iluvatar,or other VENDOR: "nvidia" +# VENDOR: "cambricon" +# VENDOR: "metax" +# VENDOR: "kunlunxin" FLAGPERF_LOG_LEVEL: "info" HOSTS: ["192.168.1.2"] NPROC_PER_NODE: 1 @@ -8,17 +12,29 @@ SSH_PORT: "22" HOSTS_PORTS: ["2222"] MASTER_PORT: "29501" SHM_SIZE: "32G" -ACCE_CONTAINER_OPT: " --gpus all" +# only for iluvatar,dual process operation, modify device id,0 or 1 +DEVICE: 0 # for nvidia, using " -- gpus all" +# for metax, using " --device=/dev/dri --device=/dev/mxcd --group-add video" +# for kunlunxin, using "--device=/dev/xpu0 --device=/dev/xpu1 --device=/dev/xpu2 --device=/dev/xpu3 --device=/dev/xpu4 --device=/dev/xpu5 --device=/dev/xpu6 --device=/dev/xpu7 --device=/dev/xpuctrl" +# for cambricon, using " --device=/dev/cambricon_dev0:/dev/cambricon_dev0 --device=/dev/cambricon_dev1:/dev/cambricon_dev1 --device=/dev/cambricon_dev2:/dev/cambricon_dev2 --device=/dev/cambricon_dev3:/dev/cambricon_dev3 --device=/dev/cambricon_dev4:/dev/cambricon_dev4 --device=/dev/cambricon_dev5:/dev/cambricon_dev5 --device=/dev/cambricon_dev6:/dev/cambricon_dev6 --device=/dev/cambricon_dev7:/dev/cambricon_dev7 --device=/dev/cambricon_ctl " +# for iluvatar, using "" # for xxx, using +ACCE_CONTAINER_OPT: " --gpus all" PIP_SOURCE: "https://mirror.baidu.com/pypi/simple" CLEAR_CACHES: True # for nvidia, using "CUDA_VISIBLE_DEVICES" +# for 
metax, using "MACA_VISIBLE_DEVICES" +# for cambricon, using "MLU_VISIBLE_DEVICES" # for xxx, using ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES" # "operation:dataFormat:chip": "docker_images" # now only support flaggems and nativepytorch CASES: - "mm:FP16:nativetorch:A100_40_SXM": "ngctorch2403" + "mm:FP16:312:nativetorch:A100_40_SXM": "ngctorch2403" # "mm:FP16:flaggems:A100_40_SXM": "ngctorch2403" # "mm:FP16:nativetorch:A100_40_SXM": "ngctorch2403" +# 'exp:FP32:nativetorch:R300p" : "xpytorch029" +# 'exp:FP32:flaggems:R300p" : "xpytorch029" +# "abs:FP32:nativetorch:BI150": "bi150-410" +# "argmax:BF16:312:flaggems:MLU": "camtorch0830" diff --git a/operation/container_main.py b/operation/container_main.py index 6e6dc3757..787e99e9a 100644 --- a/operation/container_main.py +++ b/operation/container_main.py @@ -23,7 +23,7 @@ def parse_args(): type=int, required=True, help="number of node") - + parser.add_argument("--nproc_per_node", type=int, required=True, @@ -43,27 +43,21 @@ def parse_args(): type=str, required=True, help="log level") - + parser.add_argument("--master_port", type=int, required=True, help="master port") - + parser.add_argument("--master_addr", type=str, required=True, help="master ip") - - parser.add_argument("--host_addr", - type=str, - required=True, - help="my ip") - - parser.add_argument("--node_rank", - type=int, - required=True, - help="my rank") - + + parser.add_argument("--host_addr", type=str, required=True, help="my ip") + + parser.add_argument("--node_rank", type=int, required=True, help="my rank") + parser.add_argument("--perf_path", type=str, required=True, @@ -84,40 +78,46 @@ def write_pid_file(pid_file_path, pid_file): file_d = open(pid_file_path, "w") file_d.write("%s\n" % os.getpid()) file_d.close() - + if __name__ == "__main__": config = parse_args() - - logfile = os.path.join(config.log_dir, config.case_name, config.host_addr + "_noderank" + str(config.node_rank), "container_main.log.txt") + + logfile = os.path.join( + 
#!/bin/bash

# Top-level driver: sweep every configured (op, dataformat) pair through
# run.sh and tally the outcomes in overall.data.

# ==========================修改点1: START==========================
VENDOR="nvidia"
ACCE_CONTAINER_OPT=" --gpus all"
ACCE_VISIBLE_DEVICE_ENV_NAME="CUDA_VISIBLE_DEVICES"
SSH_PORT="22"
HOSTS_PORTS="[\"2222\"]"
MASTER_PORT="29501"
TDP="400W"

ip_address="10.1.2.155"
chip_name="A100_40_SXM"
env_name="ngctorch2403"

# Spec TFLOPS of the chip per data format (passed to run.sh --spec_tflops).
declare -A spec_tflops_dict
spec_tflops_dict["BF16"]=312
spec_tflops_dict["FP16"]=312
spec_tflops_dict["FP32"]=19.5
spec_tflops_dict["INT32"]=19.5
spec_tflops_dict["INT16"]=-1
#=============================STOP==========================

# Map: op name -> space-separated list of data formats to benchmark.
declare -A op_dict
# ==========================修改点2: START==========================
op_dict["abs"]="FP32 FP16 BF16"
op_dict["add"]="FP32 FP16 BF16"
op_dict["addmm"]="FP32 FP16 BF16"
op_dict["all"]="FP32 FP16 BF16"
op_dict["amax"]="FP32 FP16 BF16"
op_dict["argmax"]="FP32 FP16 BF16"
op_dict["bitwise_and"]="INT32 INT16"
op_dict["bitwise_not"]="INT32 INT16"
op_dict["bitwise_or"]="INT32 INT16"
op_dict["bmm"]="FP32 FP16 BF16"
op_dict["cos"]="FP32 FP16 BF16"
op_dict["cross_entropy_loss"]="FP32 FP16 BF16"
op_dict["div"]="FP32 FP16 BF16"
op_dict["dropout"]="FP32 FP16 BF16"
op_dict["eq"]="FP32 FP16 BF16"
op_dict["exp"]="FP32 FP16 BF16"
op_dict["ge"]="FP32 FP16 BF16"
op_dict["gelu"]="FP32 FP16 BF16"
op_dict["group_norm"]="FP32 FP16 BF16"
op_dict["gt"]="FP32 FP16 BF16"
op_dict["isinf"]="FP32 FP16 BF16"
op_dict["isnan"]="FP32 FP16 BF16"
op_dict["layer_norm"]="FP32 FP16 BF16"
op_dict["le"]="FP32 FP16 BF16"
op_dict["linear"]="FP32 FP16 BF16"
op_dict["log_softmax"]="FP32 FP16 BF16"
op_dict["lt"]="FP32 FP16 BF16"
op_dict["max"]="FP32 FP16 BF16"
op_dict["mean"]="FP32 FP16 BF16"
op_dict["min"]="FP32 FP16 BF16"
op_dict["mm"]="FP32 FP16 BF16"
op_dict["mul"]="FP32 FP16 BF16"
op_dict["mv"]="FP32 FP16 BF16"
op_dict["native_dropout"]="FP32 FP16 BF16"
op_dict["native_group_norm"]="FP32 FP16 BF16"
op_dict["ne"]="FP32 FP16 BF16"
op_dict["neg"]="FP32 FP16 BF16"
op_dict["pow"]="FP32 FP16 BF16"
op_dict["prod"]="FP32 FP16 BF16"
op_dict["reciprocal"]="FP32 FP16 BF16"
op_dict["relu"]="FP32 FP16 BF16"
op_dict["rsqrt"]="FP32 FP16 BF16"
op_dict["sigmoid"]="FP32 FP16 BF16"
op_dict["silu"]="FP32 FP16 BF16"
op_dict["sin"]="FP32 FP16 BF16"
op_dict["softmax"]="FP32 FP16 BF16"
op_dict["sub"]="FP32 FP16 BF16"
op_dict["sum"]="FP32 FP16 BF16"
op_dict["tanh"]="FP32 FP16 BF16"
op_dict["triu"]="FP32 FP16 BF16"
#=============================STOP==========================

# run.sh reads these from the environment.
export VENDOR
export ACCE_CONTAINER_OPT
export ACCE_VISIBLE_DEVICE_ENV_NAME
export SSH_PORT
export HOSTS_PORTS
export MASTER_PORT
export TDP

file="overall.data"

# Start from a fresh results file.
if [ -e "$file" ]; then
    rm "$file"
    echo "$file 已被删除"
fi
touch "$file"

total=0
success=0
fail=0

# Sweep every op, then every data format configured for that op.
for key in "${!op_dict[@]}"; do
    IFS=' ' read -r -a value_list <<< "${op_dict[$key]}"
    for value in "${value_list[@]}"; do
        echo "Running operation: $key with data format: $value"
        total=$((total + 1))
        if bash run.sh --op_name "$key" --data_format "$value" --ip_address "$ip_address" --chip_name "$chip_name" --env_name "$env_name" --spec_tflops "${spec_tflops_dict[$value]}"; then
            echo "success: ${key} ${value}" >> "$file"
            success=$((success + 1))
        else
            echo "fail: ${key} ${value}" >> "$file"
            fail=$((fail + 1))
        fi
    done
done

echo -e "\n\n\ntotal: ${total}" >> "$file"
echo "success: ${success}" >> "$file"
echo "fail: ${fail}" >> "$file"
"""Render a FlagPerf operation-benchmark README.md from a flagperf run log.

Invoked by run.sh as: python render.py <log_file> <case_type> <readme_dir>.
Metric values are scraped from the log with regexes, formatted, accumulated
in <readme_dir>/data.json across the nativetorch/flaggems runs, and finally
rendered through template.md.
"""
import re
import sys
import os
import ast

# TDP of a single card, injected by run.sh through the TDP environment
# variable; empty string when unset.
tdp = os.environ.get('TDP')
single_card_tdp = tdp if tdp else ''

# Regexes used to pull each metric out of the run log (group 1 = value).
regex_dict = {
    # Core evaluation results
    'correctness': r'Correctness with CPU golden Reference:(.*)',
    'tflops': r'cputime=[0-9.]+\s+us,\s+throughput=[0-9.]+\s+op/s,\s+equals to (.*?) TFLOPS\s+',
    'kernel_clock': r'kerneltime=[0-9.]+\s+us,\s+throughput=[0-9.]+\s+op/s,\s+equals to (.*?) TFLOPS\s+',
    'fu_cputime': r'cputime=(.*?),',
    'kerneltime': r'FLOPS utilization: cputime=.*kerneltime=(.*?)\s+',
    # Other evaluation results
    'cpu_time': r'cputime=(.*?) us',
    'kernel_time': r'kerneltime=(.*?) us',
    'cpu_ops': r'cputime=.*, throughput=(.*?) op/s',
    'kernel_ops': r'kerneltime=.*, throughput=(.*?) op/s',
    'no_warmup_delay': r'no warmup=(.*?) us',
    'warmup_delay': r'no warmup=[0-9.]+\s+us,\s+warmup=(.*?) us',
    # Power monitoring results
    'ave_system_power': r'AVERAGE: (.*?) Watts',
    'max_system_power': r'MAX: (.*?) Watts',
    'system_power_stddev': r'STD DEVIATION: (.*?) Watts',
    'single_card_avg_power': r'RANK.* AVERAGE: (.*?) Watts',
    'single_card_max_power': r'RANK.* MAX: (.*?) Watts',
    'single_card_power_stddev': r'RANK.* STD DEVIATION: (.*?) Watts',
    # Other important monitoring results
    'avg_cpu_usage': r'SYSTEM CPU:\s+.*AVERAGE:\s+(\d+\.\d+)\s+%',
    'avg_mem_usage': r'SYSTEM MEMORY:\s+.*AVERAGE:\s+(\d+\.\d+)\s+%',
    'single_card_avg_temp': r'AI-chip TEMPERATURE:\s+.*AVERAGE: (.*?) °C',
    'max_gpu_memory_usage_per_card': r'AI-chip MEMORY:\s+.*AVERAGE:\s+\d+\.\d+ %,\s+MAX:\s+(\d+\.\d+) %',
}

# Per-metric formatting directives: None = use as-is; "2E"/"2F" = numeric
# reformatting; any other string is appended as a unit suffix.
format_dict = {
    # Core evaluation results
    'correctness': None,
    'tflops': ["TFLOPS"],
    'kernel_clock': ["TFLOPS"],
    'fu_cputime': None,
    'kerneltime': None,
    # Other evaluation results
    'cpu_time': ["us"],
    'kernel_time': ["us"],
    'cpu_ops': ["2F", 'op/s'],
    'kernel_ops': ["2F", 'op/s'],
    'no_warmup_delay': ['us'],
    'warmup_delay': ['us'],
    # Power monitoring results
    'ave_system_power': ['W'],
    'max_system_power': ['W'],
    'system_power_stddev': ['W'],
    'single_card_avg_power': ['W'],
    'single_card_max_power': ['W'],
    'single_card_power_stddev': ['W'],
    # Other important monitoring results
    'avg_cpu_usage': ['%'],
    'avg_mem_usage': ['%'],
    'single_card_avg_temp': ['°C'],
    'max_gpu_memory_usage_per_card': ['%'],
}


def read_log_from_file(file_name):
    """Return the whole text of *file_name*, or None if it does not exist."""
    try:
        with open(file_name, 'r') as file:
            return file.read()
    except FileNotFoundError:
        print(f"File '{file_name}' not found.")
        return None


def render(extracted_values, readme_file_path):
    """Render template.md with *extracted_values* into <readme_file_path>/README.md."""
    # Local import: jinja2 is only needed at render time.
    from jinja2 import Environment, FileSystemLoader
    current_path = os.path.dirname(os.path.abspath(__file__))
    print(current_path)
    template_path = os.path.join(current_path)
    env = Environment(loader=FileSystemLoader(template_path))
    template = env.get_template('template.md')
    rendered_text = template.render(extracted_values)
    dest_file_path = os.path.join(readme_file_path, "README.md")
    with open(dest_file_path, 'w') as file:
        file.write(rendered_text)


def extract_values_from_log(log_text, regex_dict):
    """Apply each regex to *log_text*; return {key: first-group value or None}."""
    extracted_values = {}
    for key, regex_pattern in regex_dict.items():
        match = re.search(regex_pattern, log_text)
        extracted_values[key] = match.group(1).strip() if match else None
    return extracted_values


def format_values(extracted_values, format_dict):
    """Apply format_dict directives to raw extracted strings.

    Unmatched metrics (None) are passed through unchanged instead of
    crashing in float() or rendering as the string 'None<unit>'.
    """
    formatted_values = {}
    for key, value in extracted_values.items():
        if key not in format_dict or value is None:
            formatted_values[key] = value if key in format_dict else None
            continue
        if not format_dict[key]:
            formatted_values[key] = value
            continue
        for format_type in format_dict[key]:
            if format_type == "2E":
                formatted_values[key] = f"{float(value):.2E}"
            elif format_type == "2F":
                formatted_values[key] = str(round(float(value), 2))
            else:
                # Append the unit to the (possibly already reformatted) value.
                base = formatted_values.get(key, value)
                formatted_values[key] = f"{base}{format_type}"
    return formatted_values


# Read log from file specified in command line argument
if __name__ == "__main__":
    # argv: [render.py, log_file, case_type, readme_dir] -> need 4 entries.
    if len(sys.argv) >= 4:
        file_name = sys.argv[1]
        data_type = sys.argv[2]
        readme_file_path = sys.argv[3]
        log_text = read_log_from_file(file_name)
        if log_text:
            # Only the section after the "analysis logs" marker holds metrics.
            log_text = log_text.split("analysis logs")[1]
            extracted_values = extract_values_from_log(log_text, regex_dict)
            for key, value in extracted_values.items():
                print(f"{key}: {value}")

            extracted_values = format_values(extracted_values, format_dict)
            extracted_values = {f"{data_type}_{key}": value for key, value in extracted_values.items()}
            extracted_values[f"{data_type}_single_card_tdp"] = single_card_tdp
            data_file = os.path.join(readme_file_path, "data.json")

            if os.path.exists(data_file):
                with open(data_file, 'r') as file:
                    data = file.read()
                # Merge values from the other case type's run.
                # literal_eval (not eval): the file holds a repr'd dict.
                data_values = ast.literal_eval(data)
                extracted_values.update(data_values)
                with open(data_file, 'w') as file:
                    file.write(str(extracted_values))
                # Both nativetorch and flaggems present -> render the README.
                if len(extracted_values.keys()) >= 44:
                    render(extracted_values, readme_file_path)
            else:
                with open(data_file, 'w+') as file:
                    file.write(str(extracted_values))
    else:
        print("Please provide a file name as a command line argument.")
#!/bin/bash

# Run one (op, data_format) benchmark for both nativetorch and flaggems,
# then parse the logs and render the README via render.py.
# Env inputs (exported by main.sh): VENDOR, ACCE_CONTAINER_OPT,
# ACCE_VISIBLE_DEVICE_ENV_NAME, SSH_PORT, HOSTS_PORTS, MASTER_PORT, TDP.

# 函数: 修改配置 — rewrite operation/configs/host.yaml for one case type.
modify_config() {
    echo "Modifying configuration..."
    local case_type=$1
    local result_dir=$2
    echo "case_type is: ${case_type}"
    echo "result_dir is: ${result_dir}"

    sed -i "s|^VENDOR:.*$|VENDOR: \"$VENDOR\"|" "${OPERATIONDIR}/configs/host.yaml"
    sed -i "s|^SSH_PORT:.*$|SSH_PORT: \"$SSH_PORT\"|" "${OPERATIONDIR}/configs/host.yaml"
    sed -i "s|^HOSTS_PORTS:.*$|HOSTS_PORTS: $HOSTS_PORTS|" "${OPERATIONDIR}/configs/host.yaml"
    sed -i "s|^MASTER_PORT:.*$|MASTER_PORT: \"$MASTER_PORT\"|" "${OPERATIONDIR}/configs/host.yaml"
    sed -i "s|^ACCE_CONTAINER_OPT:.*$|ACCE_CONTAINER_OPT: \"$ACCE_CONTAINER_OPT\"|" "${OPERATIONDIR}/configs/host.yaml"
    sed -i "s|^ACCE_VISIBLE_DEVICE_ENV_NAME:.*$|ACCE_VISIBLE_DEVICE_ENV_NAME: \"$ACCE_VISIBLE_DEVICE_ENV_NAME\"|" "${OPERATIONDIR}/configs/host.yaml"

    # 修改 operation/configs/host.yaml
    sed -i "s|^FLAGPERF_PATH:.*$|FLAGPERF_PATH: \"$OPERATIONDIR\"|" "${OPERATIONDIR}/configs/host.yaml"

    sed -i "s|^FLAGPERF_LOG_PATH:.*$|FLAGPERF_LOG_PATH: \"$result_dir\/logs\"|" "${OPERATIONDIR}/configs/host.yaml"

    # 修改ip
    sed -i "s|^HOSTS:.*$|HOSTS: [\"$ip_address\"]|" "${OPERATIONDIR}/configs/host.yaml"

    # 替换FP 和 替换case 类型等
    # e.g. "eq:FP32:19.5:nativetorch:A100_40_SXM": "ngctorch2403"
    sed -i "s|^ \".*:.*:.*:.*\": \".*\"| \"$op_name:$data_format:$spec_tflops:$case_type:$chip_name\": \"$env_name\"|" "${OPERATIONDIR}/configs/host.yaml"
    # 备份一下, 方便排查问题
    cp "${OPERATIONDIR}/configs/host.yaml" "${result_dir}/bak_${case_type}_host.yaml"
}


# Locate the newest log directory and feed flagperf_run.log to render.py.
parse_log() {
    local result_dir=$1
    log_dir="${OPERATIONDIR}/${result_dir}/logs"
    latest_folder=$(ls -td "$log_dir"/*/ | head -n 1)
    echo "log dir is: ${latest_folder}"
    log_file_path="${latest_folder}flagperf_run.log"
    readme_file_path="${OPERATIONDIR}/${result_dir}"
    if [ -f "$log_file_path" ]; then
        cd "${CURRENTDIR}"
        python render.py "${log_file_path}" "${case_type}" "${readme_file_path}"
        if [ $? -eq 0 ]; then
            echo "markdown渲染成功"
        else
            echo "markdown渲染失败"
            exit 1
        fi
    else
        echo "error: log dir not exist"
        exit 1
    fi
}

# Run FlagPerf (run.py) for the currently configured case, then parse logs.
run_cases_and_gen_readme() {
    local case_type=$1
    local result_dir=$2
    cd "$OPERATIONDIR"
    echo "-------------------current dir---------------------"
    echo `pwd`
    echo "start to run..."
    python run.py
    if [ $? -eq 0 ]; then
        echo "执行成功"
        parse_log "$result_dir"
    else
        echo "执行失败"
        exit 1
    fi
}


main() {
    result_dir="results/${op_name}_${data_format}"
    cd "$OPERATIONDIR"
    mkdir -p "$result_dir"
    # Fresh accumulator: render.py merges both case types into data.json.
    if [ -f "${result_dir}/data.json" ]; then
        rm "${result_dir}/data.json"
    fi
    # BUGFIX: the array and the loop variable previously shared the name
    # `case_type`, the scalar loop variable clobbering the array.
    local case_types=("nativetorch" "flaggems")
    for case_type in "${case_types[@]}"; do
        cd "$OPERATIONDIR"
        modify_config "$case_type" "$result_dir"
        run_cases_and_gen_readme "$case_type" "$result_dir"
    done
}

# Initialize variables with default values.
# BUGFIX: spec_tflops was initialised to 0, so the required-option check
# `[ -z "$spec_tflops" ]` below could never fire; default to empty instead.
data_format=""
op_name=""
ip_address=""
chip_name=""
env_name=""
spec_tflops=""

usage() {
    echo "Usage: $0 --op_name --data_format --ip_address --chip_name --env_name --spec_tflops "
    exit 1
}


# Parse command line options
while [ $# -gt 0 ]; do
    case "$1" in
        --data_format)
            data_format="$2"
            shift 2
            ;;
        --op_name)
            op_name="$2"
            shift 2
            ;;
        --ip_address)
            ip_address="$2"
            shift 2
            ;;
        --chip_name)
            chip_name="$2"
            shift 2
            ;;
        --env_name)
            env_name="$2"
            shift 2
            ;;
        --spec_tflops)
            spec_tflops="$2"
            shift 2
            ;;
        *)
            echo "Unknown option: $1"
            usage
            ;;
    esac
done

# Check if required options are provided
if [ -z "$data_format" ] || [ -z "$op_name" ] || [ -z "$ip_address" ] || [ -z "$chip_name" ] || [ -z "$env_name" ] || [ -z "$spec_tflops" ]; then
    echo "Error: Missing required options."
    usage
fi

# Display parsed options
echo "data_format: $data_format"
echo "op_name: $op_name"
echo "ip_address: $ip_address"
echo "chip_name: $chip_name"
echo "env_name: $env_name"
echo "spec_tflops: $spec_tflops"


# Read env vars (exported by main.sh; kept for documentation purposes).
VENDOR="$VENDOR"
ACCE_CONTAINER_OPT="$ACCE_CONTAINER_OPT"
ACCE_VISIBLE_DEVICE_ENV_NAME="$ACCE_VISIBLE_DEVICE_ENV_NAME"

CURRENTDIR=$(pwd)
# 获取当前路径的上一级目录
OPERATIONDIR=$(dirname "$CURRENTDIR")

main
Commit ID: XXX + +# 评测结果 + +## 核心评测结果 + +| 评测项 | correctness | TFLOPS(cpu wall clock) | TFLOPS(kernel clock) | FU(FLOPS Utilization)-cputime | FU-kerneltime | +| ---- | -------------- | -------------- | ------------ | ------ | ----- | +| flaggems | {{ flaggems_correctness }} | {{ flaggems_tflops }} | {{ flaggems_kernel_clock}} | {{ flaggems_fu_cputime }} | {{ flaggems_kerneltime }} | +| nativetorch | {{ nativetorch_correctness }} | {{ nativetorch_tflops }} | {{ nativetorch_kernel_clock}} | {{ nativetorch_fu_cputime }} | {{ nativetorch_kerneltime }} | + +## 其他评测结果 + +| 评测项 | cputime | kerneltime | cputime吞吐 | kerneltime吞吐 | 无预热时延 | 预热后时延 | +| ---- | -------------- | -------------- | ------------ | ------------ | -------------- | -------------- | +| flaggems | {{ flaggems_cpu_time }} | {{ flaggems_kernel_time }} | {{ flaggems_cpu_ops }} | {{ flaggems_kernel_ops }} | {{ flaggems_no_warmup_delay }} | {{ flaggems_warmup_delay }} | +| nativetorch | {{ nativetorch_cpu_time }} | {{ nativetorch_kernel_time }} | {{ nativetorch_cpu_ops }} | {{ nativetorch_kernel_ops }} | {{ nativetorch_no_warmup_delay }} | {{ nativetorch_warmup_delay }} | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| nativetorch监控结果 | {{ nativetorch_ave_system_power }} | {{ nativetorch_max_system_power }} | {{ nativetorch_system_power_stddev }} | / | {{ nativetorch_single_card_avg_power }} | {{ nativetorch_single_card_max_power}} | {{ nativetorch_single_card_power_stddev}} | {{ nativetorch_single_card_tdp}} | +| flaggems监控结果 | {{ flaggems_ave_system_power }} | {{ flaggems_max_system_power }} | {{ flaggems_system_power_stddev }} | / | {{ flaggems_single_card_avg_power }} | {{ flaggems_single_card_max_power}} | {{ flaggems_single_card_power_stddev}} | {{ flaggems_single_card_tdp}} | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡最大显存占用 | +| ---- 
| --------- | -------- | ------------ | -------------- | +| nativetorch监控结果 | {{nativetorch_avg_cpu_usage}} | {{nativetorch_avg_mem_usage}} | {{nativetorch_single_card_avg_temp}} | {{nativetorch_max_gpu_memory_usage_per_card}} | +| flaggems监控结果 | {{flaggems_avg_cpu_usage}} | {{flaggems_avg_mem_usage}} | {{flaggems_single_card_avg_temp}} | {{flaggems_max_gpu_memory_usage_per_card}} | diff --git a/operation/run.py b/operation/run.py index 30a85ccd8..f3682af93 100644 --- a/operation/run.py +++ b/operation/run.py @@ -43,8 +43,7 @@ def check_cluster_health(): if len(bad_hosts) != 0: for bad_host in bad_hosts: RUN_LOGGER.error("Check " + bad_host + " failed. ssh command exit " - "with: " + str( - bad_hosts[bad_host])) + "with: " + str(bad_hosts[bad_host])) RUN_LOGGER.error("Check hosts in the cluster......[FAILED] [EXIT]") sys.exit(3) RUN_LOGGER.info("Check hosts in the cluster......[SUCCESS]") @@ -65,7 +64,8 @@ def check_cluster_deploy_path(dp_path): "...[SUCCESS]") -def prepare_docker_image_cluster(dp_path, image_mgr, framework, nnodes, config): +def prepare_docker_image_cluster(dp_path, image_mgr, framework, nnodes, + config): '''Prepare docker image in registry and in the cluster. 
''' vendor = config.VENDOR @@ -201,9 +201,9 @@ def start_tasks_in_cluster(dp_path, container_name, config, base_args, nnodes = len(config.HOSTS) framework = config.CASES[case] - op, df, oplib, chip = case.split(":") - env_dir = os.path.join( - config.FLAGPERF_PATH, "benchmarks", op, config.VENDOR, chip) + op, df, spectflops, oplib, chip = case.split(":") + env_dir = os.path.join(config.FLAGPERF_PATH, "benchmarks", op, + config.VENDOR, chip) env_shell = os.path.join(env_dir, "env.sh") req_file = os.path.join(env_dir, "requirements.txt") @@ -221,6 +221,8 @@ def start_tasks_in_cluster(dp_path, container_name, config, base_args, + "2>&1" if os.path.isfile(env_shell): + if config.VENDOR == "iluvatar": + start_cmd += " && export CUDA_VISIBLE_DEVICES=" + str(config.DEVICE) start_cmd += " && source " + env_shell \ + " > " + abs_log_path + "/env.log.txt " \ + "2>&1" @@ -232,7 +234,8 @@ def start_tasks_in_cluster(dp_path, container_name, config, base_args, start_cmd += " \"" RUN_LOGGER.debug("Run cmd in the cluster to start tasks, cmd=" + start_cmd) - CLUSTER_MGR.run_command_some_hosts_distribution_info(start_cmd, nnodes, 15, "base") + CLUSTER_MGR.run_command_some_hosts_distribution_info( + start_cmd, nnodes, 15, "base") # Wait a moment for starting tasks. 
time.sleep(60) @@ -254,7 +257,7 @@ def wait_for_finish(dp_path, container_name, pid_file_path, nnodes): if len(bad_hosts) == nnodes: break time.sleep(10) - + def prepare_containers_env_cluster(dp_path, case_log_dir, container_name, image_name, nnodes, config): @@ -305,8 +308,8 @@ def get_valid_cases(config): "No valid cases found in config/host.yaml because config.CASES is not a dict...[EXIT]" ) sys.exit(4) - RUN_LOGGER.debug( - "Check configs of all test cases: " + ",".join(config.CASES)) + RUN_LOGGER.debug("Check configs of all test cases: " + + ",".join(config.CASES)) valid_cases = [] cases_config_error = [] for case in config.CASES: @@ -337,19 +340,18 @@ def collect_and_merge_logs(curr_log_path, cases, nnodes): ",".join(failed_hosts)) get_all = False else: - RUN_LOGGER.info("Case " + case + - ", get all logs in dir: " + case_log_dir) + RUN_LOGGER.info("Case " + case + ", get all logs in dir: " + + case_log_dir) if get_all: RUN_LOGGER.info("Congrats! See all logs in " + curr_log_path) else: RUN_LOGGER.warning("Sorry! 
Not all logs have been collected in " + curr_log_path) - + def summary_logs(config, case_log_dir): - analysis_module_path = os.path.join("vendors", - config.VENDOR, + analysis_module_path = os.path.join("vendors", config.VENDOR, config.VENDOR + "_analysis") analysis_module_path = analysis_module_path.replace("/", ".") analysis_module = importlib.import_module(analysis_module_path) @@ -361,29 +363,34 @@ def summary_logs(config, case_log_dir): result[host] = {} monitor_log_dir = os.path.join(case_log_dir, host + "_noderank" + str(noderank)) - - # vendor monitor results like temp/power + + # vendor monitor results like temp/power vendor_monitor_path = os.path.join(monitor_log_dir, config.VENDOR + "_monitor.log") vendor_log = analysis_log(vendor_monitor_path, config) result[host]["vendor"] = vendor_log - - + # system monitor results like CPU/MEM/POWER for index in ["cpu", "mem", "pwr"]: - monitor_path = os.path.join(monitor_log_dir, index + "_monitor.log") + monitor_path = os.path.join(monitor_log_dir, + index + "_monitor.log") with open(monitor_path, 'r') as file: - sys_log = [float(line.split("\t")[1][:-1]) for line in file if "\t" in line] + sys_log = [ + float(line.split("\t")[1][:-1]) for line in file + if "\t" in line + ] result[host][index] = sys_log - + # FlagPerf Result - flagperf_result_path = os.path.join(monitor_log_dir, "operation.log.txt") + flagperf_result_path = os.path.join(monitor_log_dir, + "operation.log.txt") with open(flagperf_result_path, 'r') as file: - key_lines = [line.strip() for line in file if 'FlagPerf Result' in line] + key_lines = [ + line.strip() for line in file if 'FlagPerf Result' in line + ] result[host]["flagperf"] = key_lines - + noderank += 1 - return result @@ -411,7 +418,7 @@ def analysis_log(key_logs): pwr_series = key_logs[host]["vendor"]["power"][node] kmeans_series = [] for item in pwr_series: - if(np.max(pwr_series)-item) <= (item-np.min(pwr_series)): + if (np.max(pwr_series) - item) <= (item - np.min(pwr_series)): 
kmeans_series.append(item) pwr_series = kmeans_series RUN_LOGGER.info( @@ -425,7 +432,8 @@ def analysis_log(key_logs): temp_series = key_logs[host]["vendor"]["temp"][node] kmeans_series = [] for item in temp_series: - if(np.max(temp_series)-item) <= (item-np.min(temp_series)): + if (np.max(temp_series) - item) <= (item - + np.min(temp_series)): kmeans_series.append(item) temp_series = kmeans_series RUN_LOGGER.info( @@ -468,7 +476,6 @@ def analysis_log(key_logs): np.std(mem_series) * 100 / key_logs[host]["vendor"]["max_mem"], 3))) noderank += 1 - def print_welcome_msg(): @@ -536,7 +543,8 @@ def main(): # Prepare docker image. image_mgr = image_manager.ImageManager( - "flagperf-operation-" + config.VENDOR + "-" + framework, "t_" + VERSION) + "flagperf-operation-" + config.VENDOR + "-" + framework, + "t_" + VERSION) image_name = image_mgr.repository + ":" + image_mgr.tag nnodes = len(config.HOSTS) RUN_LOGGER.info("=== 2.1 Prepare docker image:" + image_name + " ===") @@ -549,7 +557,8 @@ def main(): # Set command to start docker container in the cluster container_name = image_mgr.repository + "-" + image_mgr.tag \ + "-container" - + if config.VENDOR == "iluvatar": + container_name = container_name + "_device_" + str(config.DEVICE) # Set command to start train script in container in the cluster log_dir_container = os.path.join(config.FLAGPERF_LOG_PATH, timestamp_log_dir) @@ -566,21 +575,20 @@ def main(): RUN_LOGGER.info("-== Testcase " + case + " starts ==-") RUN_LOGGER.info("1) Prepare container environments in cluster...") case_log_dir = os.path.join(curr_log_path, case) - if not prepare_containers_env_cluster(dp_path, case_log_dir, - container_name, image_name, - nnodes, config): + if not prepare_containers_env_cluster( + dp_path, case_log_dir, container_name, image_name, nnodes, + config): RUN_LOGGER.error("1) Prepare container environments in cluster" "...[FAILED]. 
Ignore case " + case) continue RUN_LOGGER.info("2) Start tasks in the cluster...") - start_tasks_in_cluster(dp_path, container_name, config, - base_args, curr_log_path, case) + start_tasks_in_cluster(dp_path, container_name, config, base_args, + curr_log_path, case) # Wait until start_xxx_task.py finished. RUN_LOGGER.info("3) Waiting for tasks end in the cluster...") - pid_file_path = os.path.join( - log_dir_container, "start_base_task.pid") + pid_file_path = os.path.join(log_dir_container, "start_base_task.pid") wait_for_finish(dp_path, container_name, pid_file_path, nnodes) RUN_LOGGER.info("3) Training tasks end in the cluster...") @@ -598,7 +606,7 @@ def main(): RUN_LOGGER.debug(key_logs) jsonfile = os.path.join(dp_path, curr_log_path, "detail_result.json") json.dump(key_logs, open(jsonfile, "w")) - + RUN_LOGGER.info("3) analysis logs") analysis_log(key_logs) diff --git a/operation/vendors/cambricon/cambricon_analysis.py b/operation/vendors/cambricon/cambricon_analysis.py new file mode 100644 index 000000000..37d803157 --- /dev/null +++ b/operation/vendors/cambricon/cambricon_analysis.py @@ -0,0 +1,25 @@ +def analysis_log(logpath, config): + logfile = open(logpath) + + result = {"temp": {}, "power": {}, "mem": {}} + for mluID in range(config.NPROC_PER_NODE): + for monitor_index in result.keys(): + result[monitor_index][mluID] = [] + + max_mem = None + next_mlu_id = 0 + + for line in logfile.readlines(): + if "C" in line: + if max_mem is None: + max_mem = float(line.split(" ")[3]) + result["max_mem"] = max_mem + temp = float(line.split(" ")[0][:-1]) + power = float(line.split(" ")[1]) + mem = float(line.split(" ")[2]) + result["temp"][next_mlu_id].append(temp) + result["power"][next_mlu_id].append(power) + result["mem"][next_mlu_id].append(mem) + next_mlu_id = (next_mlu_id + 1) % config.NPROC_PER_NODE + + return result diff --git a/operation/vendors/cambricon/cambricon_monitor.py b/operation/vendors/cambricon/cambricon_monitor.py new file mode 100644 index 
000000000..9f7e99419 --- /dev/null +++ b/operation/vendors/cambricon/cambricon_monitor.py @@ -0,0 +1,256 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. + ''' + + def __init__(self, + pid_file, + log_file, + err_file, + mlu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.mlufile = mlu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def mlu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "cnmon |grep 'Default'|awk '{print $3,$4,$9,$11,$14}'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if 
process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_mlu_mon(): + mlu_process = Process(target=mlu_mon, args=(self.mlufile, )) + mlu_process.start() + + schedule.every(self.rate).seconds.do(timer_mlu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.mlufile): + os.remove(self.mlufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/mlu_monitor.pid') + log_fn = str(log_path + '/cambricon_monitor.log') + err_fn = str(log_path + '/cambricon_monitor.err') + # result for mlu + mlu_fn = str(log_path + '/cambricon_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + mlu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' 
% pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/operation/vendors/cambricon/camtorch0830/Dockerfile b/operation/vendors/cambricon/camtorch0830/Dockerfile new file mode 100644 index 000000000..7404c1072 --- /dev/null +++ b/operation/vendors/cambricon/camtorch0830/Dockerfile @@ -0,0 +1,7 @@ +FROM cambricon-base/pytorch:v24.04-torch2.3.0-catch1.20.0-ubuntu22.04-py310_20240830 +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python +RUN apt-get update +RUN pip3 install loguru schedule munch +ENV FLAGGEMS_WORK_DIR /workspaces diff --git a/operation/vendors/cambricon/camtorch0830/camtorch0830_install.sh b/operation/vendors/cambricon/camtorch0830/camtorch0830_install.sh new file mode 100644 index 000000000..f9c3e5ed2 --- /dev/null +++ b/operation/vendors/cambricon/camtorch0830/camtorch0830_install.sh @@ -0,0 +1,8 @@ +#!/bin/bash +mkdir -p $FLAGGEMS_WORK_DIR && cd $FLAGGEMS_WORK_DIR +rm -rf FlagGems +git clone https://mirror.ghproxy.com/https://github.com/FlagOpen/FlagGems.git +cd FlagGems +git checkout v2.0-perf-cambricon +pip install -e . 
+/etc/init.d/ssh restart diff --git a/operation/vendors/iluvatar/bi150-410/Dockerfile b/operation/vendors/iluvatar/bi150-410/Dockerfile new file mode 100644 index 000000000..70d4c1fc3 --- /dev/null +++ b/operation/vendors/iluvatar/bi150-410/Dockerfile @@ -0,0 +1,9 @@ +FROM zibo.harbor.iluvatar.com.cn:30000/saas/bi150-4.1.0-x86-ubuntu20.04-py3.10-customer-flagperf-triton:v1.0 +#RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" +#RUN /bin/bash -c "uname -a" +#RUN /bin/bash -c alias python3=python +#RUN apt-get update +#RUN pip3 install loguru +#RUN pip3 install pycuda +#RUN pip3 install schedule +#RUN pip3 install munch diff --git a/operation/vendors/iluvatar/bi150-410/bi150-410_install.sh b/operation/vendors/iluvatar/bi150-410/bi150-410_install.sh new file mode 100644 index 000000000..2a00aa0fe --- /dev/null +++ b/operation/vendors/iluvatar/bi150-410/bi150-410_install.sh @@ -0,0 +1,2 @@ +#!/bin/bash +echo "nothing to do" diff --git a/operation/vendors/iluvatar/dvfs.sh b/operation/vendors/iluvatar/dvfs.sh new file mode 100644 index 000000000..df4f848ef --- /dev/null +++ b/operation/vendors/iluvatar/dvfs.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +flag=$1 + +if [[ $flag != 0 ]] && [[ $flag != 1 ]]; then + echo "Wrong target flag: $flag" + exit +fi + +# a=$(ixsmi -q|grep 'Bus Id'|awk '{print $NF}'); +a=$(lspci|grep 1e3e|awk '{print $1}') +for i in ${a[@]}; +do + # bus_id=${i/0000/}; + bus_id=0000:${i} + # echo "---before set---" + # cat /sys/bus/pci/devices/${bus_id,,}/itr_debug + cmd="echo perf_mode $flag > /sys/bus/pci/devices/${bus_id,,}/itr_debug" + echo $cmd + eval $cmd + if [[ $flag == 0 ]]; then + printf "Turn off DVFS mode: " + else + printf "Turn on DVFS mode: " + fi + if [[ $? 
== 0 ]]; then + echo "Success" + else + echo "Failed" + fi + # echo "---after set---" + # cat /sys/bus/pci/devices/${bus_id,,}/itr_debug +done diff --git a/operation/vendors/iluvatar/iluvatar_analysis.py b/operation/vendors/iluvatar/iluvatar_analysis.py new file mode 100644 index 000000000..2a32d3cc2 --- /dev/null +++ b/operation/vendors/iluvatar/iluvatar_analysis.py @@ -0,0 +1,27 @@ +def analysis_log(logpath, config): + logfile = open(logpath) + + result = {"temp": {}, "power": {}, "mem": {}} + for gpuID in range(config.NPROC_PER_NODE): + for monitor_index in result.keys(): + result[monitor_index][gpuID] = [] + + max_mem = None + next_gpu_id = 0 + + for line in logfile.readlines(): + if "MiB" in line: + if max_mem is None: + max_mem = float(line.split(" ")[3][:-3]) + result["max_mem"] = max_mem + temp = float(line.split(" ")[0][:-1]) + if next_gpu_id % 2 != 0: + power = float(line.split(" ")[1][:-1]) + result["power"][next_gpu_id].append(power) + result["power"][next_gpu_id-1].append(power) + mem = float(line.split(" ")[2][:-3]) + result["temp"][next_gpu_id].append(temp) + result["mem"][next_gpu_id].append(mem) + next_gpu_id = (next_gpu_id + 1) % config.NPROC_PER_NODE + + return result diff --git a/operation/vendors/iluvatar/iluvatar_monitor.py b/operation/vendors/iluvatar/iluvatar_monitor.py new file mode 100644 index 000000000..e1ffaf620 --- /dev/null +++ b/operation/vendors/iluvatar/iluvatar_monitor.py @@ -0,0 +1,256 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. 
+ verbose: debug mode, disabled default. + ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "export PATH=/usr/local/corex/bin:$PATH; export LD_LIBRARY_PATH=/usr/local/corex/lib; ixsmi |grep 'Default'|awk '{print $3,$5,$9,$11,$13}'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, 
e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/gpu_monitor.pid') + log_fn = str(log_path + '/iluvatar_monitor.log') + err_fn = str(log_path + '/iluvatar_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/iluvatar_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......'
% pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/operation/vendors/kunlunxin/kunlunxin_analysis.py b/operation/vendors/kunlunxin/kunlunxin_analysis.py new file mode 100644 index 000000000..ebee86b2d --- /dev/null +++ b/operation/vendors/kunlunxin/kunlunxin_analysis.py @@ -0,0 +1,25 @@ +def analysis_log(logpath, config): + logfile = open(logpath) + + result = {"temp": {}, "power": {}, "mem": {}} + for gpuID in range(config.NPROC_PER_NODE): + for monitor_index in result.keys(): + result[monitor_index][gpuID] = [] + + max_mem = None + next_gpu_id = 0 + + for line in logfile.readlines(): + if "MiB" in line: + if max_mem is None: + max_mem = float(line.split(" ")[3][:-3]) + result["max_mem"] = max_mem + temp = float(line.split(" ")[0][:-1]) + power = float(line.split(" ")[1][:-1]) + mem = float(line.split(" ")[2][:-3]) + result["temp"][next_gpu_id].append(temp) + result["power"][next_gpu_id].append(power) + result["mem"][next_gpu_id].append(mem) + next_gpu_id = (next_gpu_id + 1) % config.NPROC_PER_NODE + + return result diff --git a/operation/vendors/kunlunxin/kunlunxin_monitor.py b/operation/vendors/kunlunxin/kunlunxin_monitor.py new file mode 100644 index 000000000..c031e63d2 --- /dev/null +++ b/operation/vendors/kunlunxin/kunlunxin_monitor.py @@ -0,0 +1,256 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. 
+ ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "xpu-smi -m | awk '{print $5\"C\",$9\"W\",$18\"MiB\",$19\"MiB\",$20\"%\"}'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = 
os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. 
''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/kunlunxin_monitor.pid') + log_fn = str(log_path + '/kunlunxin_monitor.log') + err_fn = str(log_path + '/kunlunxin_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/kunlunxin_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' 
% pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/operation/vendors/kunlunxin/xpytorch029/Dockerfile b/operation/vendors/kunlunxin/xpytorch029/Dockerfile new file mode 100644 index 000000000..7ecbad8f3 --- /dev/null +++ b/operation/vendors/kunlunxin/xpytorch029/Dockerfile @@ -0,0 +1,19 @@ +#FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.29 +FROM xpytorch-flaggems:v0.1 +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python +ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH +ENV PATH /usr/local/xpu/bin:$PATH +RUN /bin/bash -c 'wget -O /tmp/xre.tar.gz https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.15.1/xre-Linux-x86_64-5.0.15.1.tar.gz && cd /tmp && tar zxf xre.tar.gz && cp -a xre-Linux-x86_64-5.0.15.1 /usr/local/xpu' + +#RUN apt-get update +RUN pip3 install loguru +#RUN pip3 install pycuda +RUN pip3 install schedule +RUN pip3 install munch +#RUN /bin/bash -c 'wget -O /tmp/xpytorch.run https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/R300_plus/latest/xpytorch-cp38-torch201-ubuntu2004-x64.run && bash /tmp/xpytorch.run' +#RUN /bin/bash -c 'source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda' + +ENV TRITON_XPU_ARCH 3 +ENV CUDART_DUMMY_REGISTER 1 diff --git a/operation/vendors/kunlunxin/xpytorch029/xpytorch029_install.sh b/operation/vendors/kunlunxin/xpytorch029/xpytorch029_install.sh new file mode 100644 index 000000000..cd46c78b2 --- /dev/null +++ b/operation/vendors/kunlunxin/xpytorch029/xpytorch029_install.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -x + +# conda env +source /root/miniconda/etc/profile.d/conda.sh && conda activate python38_torch201_cuda +pip install pytest loguru schedule + +# xpytorch install +wget -q -O xpytorch.run 
https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/R300_plus/latest/flaggems/xpytorch-cp38-torch201-ubuntu2004-x64.run && bash xpytorch.run &> install-xpytorch.log +CUDART_DUMMY_REGISTER=1 python -m torch_xmlir --doctor +CUDART_DUMMY_REGISTER=1 python -c "import torch; print(torch.rand(2,3).cuda())" + +# xpu triton +wget -q -O triton-2.1.0-cp38-cp38-linux_x86_64.whl https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/R300_plus/latest/flaggems/triton-2.1.0-cp38-cp38-linux_x86_64.whl && pip install --no-deps --force-reinstall ./triton-2.1.0-cp38-cp38-linux_x86_64.whl &> install-triton.log +pip show triton +cp -v /root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/triton/testing.py /root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/triton/testing.py.bak +wget -q -O /root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/triton/testing.py https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/R300_plus/latest/flaggems/triton.testing.py + + +# FlagGems +test -d FlagGems && mv FlagGems FlagGems.bak +git clone https://mirror.ghproxy.com/https://github.com/FlagOpen/FlagGems.git +#git clone https://github.com/FlagOpen/FlagGems.git +cd FlagGems +git checkout v2.0-perf-klx +pip install -e . 
--no-deps + + +# test flaggems +export TRITON_XPU_ARCH=3 +export CUDART_DUMMY_REGISTER=1 +cd /home/FlagGems && python -m pytest -s tests/test_binary_pointwise_ops.py::test_accuracy_add[dtype0-0.001-shape0] diff --git a/operation/vendors/metax/metax_analysis.py b/operation/vendors/metax/metax_analysis.py new file mode 100755 index 000000000..83f3b47fb --- /dev/null +++ b/operation/vendors/metax/metax_analysis.py @@ -0,0 +1,38 @@ +def analysis_log(logpath, config): + logfile = open(logpath) + + result = {"temp": {}, "power": {}, "mem": {}} + for gpuID in range(config.NPROC_PER_NODE): + for monitor_index in result.keys(): + result[monitor_index][gpuID] = [] + + max_mem = None + next_gpu_id = 0 + + max_usage = 0.0 + for line in logfile.readlines(): + if "MiB" in line: + if max_mem is None: + usage_and_maxusage = line.split(" ")[2] + result["max_mem"] = float(usage_and_maxusage.split("/")[1]) + + temp_str = line.split(" ")[0] + temp = (float(temp_str[:-1])) + power_str = line.split(" ")[1] + power = (float(power_str[:-1])) + #temp = line.split(" ")[0] + #power = line.split(" ")[1] + usage_and_maxusage = line.split(" ")[2] + usage = float(usage_and_maxusage.split("/")[0]) + max_usage = max(max_usage, usage) + max_mem = float(usage_and_maxusage.split("/")[1]) + print(" temp vuale:", temp) + print(" power value:", power) + print (" max_usage value:", max_usage) + print ("max mem value:", max_mem) + result["temp"][next_gpu_id].append(temp) + result["power"][next_gpu_id].append(power) + result["mem"][next_gpu_id].append(max_usage) + next_gpu_id = (next_gpu_id + 1) % config.NPROC_PER_NODE + + return result \ No newline at end of file diff --git a/operation/vendors/metax/metax_monitor.py b/operation/vendors/metax/metax_monitor.py new file mode 100755 index 000000000..e3822df2f --- /dev/null +++ b/operation/vendors/metax/metax_monitor.py @@ -0,0 +1,256 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation 
start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. + ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "mx-smi |grep 'MiB' -m 1 | awk '{print $2, $3, $5,$6}' && mx-smi |grep 'MXC' -m 1 | awk '{print $7}'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, 
args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/gpu_monitor.pid') + log_fn = str(log_path + '/metax_monitor.log') + err_fn = str(log_path + '/metax_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/metax_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' 
% pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/operation/vendors/metax/ngctorch2403/Dockerfile b/operation/vendors/metax/ngctorch2403/Dockerfile new file mode 100755 index 000000000..68ed048db --- /dev/null +++ b/operation/vendors/metax/ngctorch2403/Dockerfile @@ -0,0 +1,10 @@ +FROM flagperf-operation-metax-ngctorch2403:v1.0 +RUN /bin/bash -c "pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python +ENV MACA_VISIBLE_DEVICES=7 +RUN apt-get update +RUN pip3 install loguru +#RUN pip3 install pycuda +RUN pip3 install schedule +RUN pip3 install munch diff --git a/operation/vendors/metax/ngctorch2403/ngctorch2403_install.sh b/operation/vendors/metax/ngctorch2403/ngctorch2403_install.sh new file mode 100755 index 000000000..39abb5e3b --- /dev/null +++ b/operation/vendors/metax/ngctorch2403/ngctorch2403_install.sh @@ -0,0 +1,10 @@ +#!/bin/bash +current_dir=$(pwd) +echo "=====>$current_dir" +script_dir=$(dirname "$(realpath "$0")") +echo "script dir :$script_dir" + +cd /workspace/docker_image/FlagGems +#git clone https://mirror.ghproxy.com/https://github.com/FlagOpen/FlagGems.git +#git checkout . +pip3 install . 
\ No newline at end of file diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/README.md b/training/ascend/llava1.5_7b-deepspeed-torch/README.md new file mode 100644 index 000000000..d1f7b9f49 --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/README.md @@ -0,0 +1,58 @@ +### Ascend 配置与运行信息参考 +#### 环境配置 +- ##### Atlas 800T A2硬件环境 + - 机器型号: Atlas 800T A2 + - 加速卡型号: Atlas 800T A2 + - CPU型号: KunPeng 920 + - 多机网络类型、带宽: 此评测样例无需多机网络 + +- ##### Atlas 800T A2软件环境 + - OS版本:Ubuntu 22.04 LTS + - OS kernel版本: 5.15.0-25-generic + - 加速卡驱动版本:24.1.rc2 + - Docker 版本:此评测样例无需docker环境 + - 训练框架版本:deepspeed 0.13.1 + +- ##### 并行策略 + + - 并行技术:sharded data parallel + - 实施者:deepspeed ZeRO-DP + - 实施细节:ZeRO-DP Pretrain-stage:O2 Finetune-stage:O3 + +- ##### 优化策略 + + - flash attention 2 + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_Atlas800TA2x1x8.py中所写,在本case中pretrain阶段为32,finetune阶段为16 + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_Atlas800TA2x1x8.py中所写,在本case中默认为2048。这里需注意,llava1.5实际训练时,实际序列长度并非都为2048,本case在计算MFU时,统计每条数据进入模型的实际序列长度求取平均值作为实际序列长度 + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为config_Atlas800TA2x1x8.py中所写,在本case中默认为1 + 4. 
global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size + +- ##### 优化策略 + + - 优化方案:flash attention 2 + + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 多模态大模型 | | +| 模型 | llava1.5_7b | | +| 数据集 | LAION-CC-SBU、llava的混合指令微调数据 | | +| 数据精度 |bf16 | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参 | +| 硬件设备简称 | Atlas 800T A2 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | +| MMMU(val)结果 | acc(推理/验证) | MMMU(val)回答准确率 | +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem |acc(MMMU) |MFU | +| ------------------- | ---------------- | ------ | ------- | --------- | --------- |--------- | +| Atlas 800T A2单机8卡(1x8)(pretrain) | / | 3448 | 0.0272 | 59/64 | - | 15.47% | +| Atlas 800T A2单机8卡(1x8)(finetune) | / | 2182 | 0.1452 | 59/64 | 27.10% | 26.54% | diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/config_Atlas800TA2x1x8.py b/training/ascend/llava1.5_7b-deepspeed-torch/config/config_Atlas800TA2x1x8.py new file mode 100644 index 000000000..0aae74504 --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/config/config_Atlas800TA2x1x8.py @@ -0,0 +1,54 @@ +# ascend npu flashattention +import transformers +import os + +cwd = os.getcwd() + +os.chdir(os.path.dirname(__file__)) + +transformers_path = os.path.dirname(transformers.__file__) + +import_utils_path = os.path.join( + transformers_path, + "utils/import_utils.py" +) +modeling_llama_path = os.path.join( + transformers_path, + "models/llama/modeling_llama.py" +) + +import_utils_patch_bash = "patch --silent --forward " + \ + import_utils_path + \ + " import_utils.patch -o import_utils.py;" +modeling_llama_patch_bash = "patch --silent --forward " + \ + modeling_llama_path + \ + " modeling_llama.patch -o modeling_llama.py;" + +if os.system(import_utils_patch_bash) == 0: + 
os.system("mv import_utils.py " + import_utils_path + ";") +if os.system(modeling_llama_patch_bash) == 0: + os.system("mv modeling_llama.py " + modeling_llama_path +";") + +# useing torch_npu +os.system("cp ./train_mem.py ../../../benchmarks/llava1.5_7b/deepspeed-torch/train/") + +os.chdir(cwd) + + +# Common arguments +theoryflops = 304277000000000.0 + +# pretrain arguments +pretrain_per_device_train_batch_size = 32 +pretrain_gradient_accumulation_steps = 1 + + +# finetune arguments +finetune_per_device_train_batch_size = 16 +finetune_gradient_accumulation_steps = 1 +output_dir_finetune = "Output/checkpoints_finetune/llava-v1.5-7b" + +# eval arguments +mmmu_data_path = "MMMU/MMMU" + +os.chdir(cwd) diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/ds_config.json b/training/ascend/llava1.5_7b-deepspeed-torch/config/ds_config.json new file mode 100644 index 000000000..8b63ca6e3 --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/config/ds_config.json @@ -0,0 +1,27 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "logging": { + "log_level": "INFO" + }, + + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 2, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto" + } +} \ No newline at end of file diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/environment_variables.sh b/training/ascend/llava1.5_7b-deepspeed-torch/config/environment_variables.sh new file mode 100644 index 000000000..f61e3c257 --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/config/environment_variables.sh @@ -0,0 +1,7 @@ +# When using Conda, please specify the location of your environment variable +# If you are using the system's environment, 
there is no need to set this variable +source /root/anaconda3/bin/activate /opt/nvme1n1/conda-envs/patch; + +# Activate the environment related to Ascend +source /usr/local/Ascend/driver/bin/setenv.bash; +source /usr/local/Ascend/ascend-toolkit/set_env.sh diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/import_utils.patch b/training/ascend/llava1.5_7b-deepspeed-torch/config/import_utils.patch new file mode 100644 index 000000000..f28360d6a --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/config/import_utils.patch @@ -0,0 +1,22 @@ +627,645c627 +< if not is_torch_available(): +< return False +< +< if not _is_package_available("flash_attn"): +< return False +< +< # Let's add an extra check to see if cuda is available +< import torch +< +< if not torch.cuda.is_available(): +< return False +< +< if torch.version.cuda: +< return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0") +< elif torch.version.hip: +< # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention +< return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.0.4") +< else: +< return False +--- +> return True diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/modeling_llama.patch b/training/ascend/llava1.5_7b-deepspeed-torch/config/modeling_llama.patch new file mode 100644 index 000000000..9e344e7c2 --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/config/modeling_llama.patch @@ -0,0 +1,53 @@ +52a53 +> import torch_npu +54,56c55,57 +< if is_flash_attn_2_available(): +< from flash_attn import flash_attn_func, flash_attn_varlen_func +< from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +--- +> # if is_flash_attn_2_available(): +> # from flash_attn import flash_attn_func, flash_attn_varlen_func +> # from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +583,597c584,586 +< if not 
self._flash_attn_uses_top_left_mask: +< causal = self.is_causal +< else: +< # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. +< causal = self.is_causal and query_length != 1 +< +< # Contains at least one padding token in the sequence +< if attention_mask is not None: +< batch_size = query_states.shape[0] +< query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( +< query_states, key_states, value_states, attention_mask, query_length +< ) +< +< cu_seqlens_q, cu_seqlens_k = cu_seq_lens +< max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens +--- +> head_num = query_states.shape[2] +> out = torch_npu.npu_fusion_attention(query_states, key_states, value_states, head_num, "BSND", keep_prob=1.0, +> scale=1.0 / math.sqrt(query_states.shape[-1])) +599,616c588 +< attn_output_unpad = flash_attn_varlen_func( +< query_states, +< key_states, +< value_states, +< cu_seqlens_q=cu_seqlens_q, +< cu_seqlens_k=cu_seqlens_k, +< max_seqlen_q=max_seqlen_in_batch_q, +< max_seqlen_k=max_seqlen_in_batch_k, +< dropout_p=dropout, +< softmax_scale=softmax_scale, +< causal=causal, +< ) +< +< attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) +< else: +< attn_output = flash_attn_func( +< query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal +< ) +--- +> return out[0] +618d589 +< return attn_output diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/net.sh b/training/ascend/llava1.5_7b-deepspeed-torch/config/net.sh new file mode 100644 index 000000000..e69de29bb diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/requirements.txt b/training/ascend/llava1.5_7b-deepspeed-torch/config/requirements.txt new file mode 100644 index 000000000..9ca6d6eb3 --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/config/requirements.txt @@ -0,0 +1,12 @@ 
+packaging +schedule +Pillow==10.3.0 +accelerate==0.26.1 +transformers==4.37.2 +sentencepiece==0.1.99 +safetensors==0.4.2 +scikit-learn==1.2.2 +scipy==1.11.1 +jinja2==3.1.2 +deepspeed==0.13.1 +datasets diff --git a/training/ascend/llava1.5_7b-deepspeed-torch/config/train_mem.py b/training/ascend/llava1.5_7b-deepspeed-torch/config/train_mem.py new file mode 100644 index 000000000..e9e668cda --- /dev/null +++ b/training/ascend/llava1.5_7b-deepspeed-torch/config/train_mem.py @@ -0,0 +1,7 @@ +import torch_npu +from torch_npu.contrib import transfer_to_npu +from train import train + + +if __name__ == "__main__": + train(attn_implementation="flash_attention_2") diff --git a/training/benchmarks/chatglm3_6b/deepspeed/chatglm3_6b_hf/modeling_chatglm.py b/training/benchmarks/chatglm3_6b/deepspeed/chatglm3_6b_hf/modeling_chatglm.py index e75568b77..8f8c43ae5 100644 --- a/training/benchmarks/chatglm3_6b/deepspeed/chatglm3_6b_hf/modeling_chatglm.py +++ b/training/benchmarks/chatglm3_6b/deepspeed/chatglm3_6b_hf/modeling_chatglm.py @@ -181,7 +181,7 @@ def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Ten class RMSNorm(torch.nn.Module): def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) + self.weight = torch.nn.Parameter(torch.ones(normalized_shape, device=device, dtype=dtype)) self.eps = eps def forward(self, hidden_states: torch.Tensor): diff --git a/training/benchmarks/chatglm3_6b/deepspeed/run_pretraining.py b/training/benchmarks/chatglm3_6b/deepspeed/run_pretraining.py index f6dd9e4ad..fc150cd10 100644 --- a/training/benchmarks/chatglm3_6b/deepspeed/run_pretraining.py +++ b/training/benchmarks/chatglm3_6b/deepspeed/run_pretraining.py @@ -80,6 +80,8 @@ def get_deepspeed_engine(args, model_config_dir, flashattn): mpu=None): model = get_chatglm_model(model_config_dir, flashattn) + if 
args.gradient_checkpointing_enable: + model.gradient_checkpointing_enable() model_engine, _, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) return model_engine @@ -109,6 +111,8 @@ def get_metric(texts): theoryflops = getattr(module, 'theoryflops') epochs = getattr(module, 'epochs') flashattn = getattr(module, 'flashattn') + gradient_checkpointing_enable = getattr(module, 'gradient_checkpointing_enable', False) + args.gradient_checkpointing_enable = gradient_checkpointing_enable deepspeed.init_distributed() model_engine = get_deepspeed_engine(args, os.path.join("chatglm3_6b_hf"), diff --git a/training/benchmarks/llama2_7b_finetune/pytorch/optimizers/__init__.py b/training/benchmarks/llama2_7b_finetune/pytorch/optimizers/__init__.py index 28dffc1a4..d3331479a 100755 --- a/training/benchmarks/llama2_7b_finetune/pytorch/optimizers/__init__.py +++ b/training/benchmarks/llama2_7b_finetune/pytorch/optimizers/__init__.py @@ -6,5 +6,6 @@ def create_optimizer(model, train_config): model.parameters(), lr=train_config.lr, weight_decay=train_config.weight_decay, + fused=True if train_config.use_fp16 else False ) return opt diff --git a/training/benchmarks/llama3_70B/flagscale/run_pretraining.py b/training/benchmarks/llama3_70B/flagscale/run_pretraining.py new file mode 100644 index 000000000..28983a7b0 --- /dev/null +++ b/training/benchmarks/llama3_70B/flagscale/run_pretraining.py @@ -0,0 +1,187 @@ +import subprocess +from argparse import ArgumentParser +import os +import sys +from importlib import import_module +import yaml +import time + + +def parse_args(): + '''we parse ddp related args, check system config args, and running env + args such as --data_dir_xxx. Then pass all useful args to the real + training script. 
+ ''' + parser = ArgumentParser(description="flagscale main python") + parser.add_argument("--world_size", type=int, required=True) + parser.add_argument("--vendor", type=str, required=True) + parser.add_argument("--data_dir", type=str, required=True) + parser.add_argument("--hosts", type=str, required=True) + parser.add_argument("--host_addr", type=str, required=True) + parser.add_argument("--log_dir", type=str, required=True) + parser.add_argument("--flagperf_config_file", type=str, required=True) + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def install_scale(module, log_dir, debug_mode=False): + if not debug_mode: + exec_cmd = getattr(module, "scale_download_cmd") + print(exec_cmd) + + install_logdir = os.path.join(log_dir, "install_logs") + os.makedirs(install_logdir) + + logfile = os.path.join(install_logdir, "scale_download.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = getattr(module, "scale_install_cmd") + logfile = os.path.join(install_logdir, "scale_install.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + +def replace_yamls(scale_home, config_module, args): + scale_conf_dir = getattr(config_module, "scale_conf_dir") + dist_yaml = getattr(config_module, "configyaml") + with open(dist_yaml, 'r') as f: + dist_data = yaml.safe_load(f) + + try: + dist_data["experiment"]["exp_dir"] = os.path.join( + args.log_dir, "outputs_llama3") + hosts = args.hosts.split(",") + dist_data["experiment"]["runner"]["nnodes"] = len(hosts) + dist_data["experiment"]["runner"]["ssh_port"] = getattr( + config_module, "flagscale_ssh_port") + hostfile = os.path.join(scale_home, "hostfile") + with open(hostfile, 'w') as f: + for host in hosts: + slots = dist_data["experiment"]["runner"]["nproc_per_node"] + 
chiptype = getattr(config_module, "flagscale_chip_type") + f.write(f"{host} slots={slots} type={chiptype}\n") + dist_data["experiment"]["runner"]["hostfile"] = hostfile + dist_data["experiment"]["cmds"] = getattr(config_module, "cmds") + except Exception as e: + print(e) + print( + "You're using an illegal config.yaml in flagscale. You must fix it" + ) + + print(dist_data) + + train_yaml = getattr(config_module, "trainyaml") + + with open(train_yaml, 'r') as f: + train_data = yaml.safe_load(f) + + try: + train_data["system"]["checkpoint"].pop("load", None) + + train_data["model"]["train_samples"] = int( + getattr(config_module, "steps") * 1024) + train_data["model"]["use_mcore_models"] = True + train_data["model"]["transformer_impl"] = "transformer_engine" + train_data["model"]["optimizer"]["lr_scheduler"][ + "lr_warmup_samples"] = 12288 + train_data["data"]["data_path"] = os.path.join( + args.data_dir, getattr(config_module, "dataset"), + "dedup-md5-pile-pile-cc_text_document") + train_data["data"]["tokenizer"]["tokenizer_path"] = os.path.join( + args.data_dir, getattr(config_module, "tokenizer")) + except Exception as e: + print(e) + print(train_data) + print( + "You're using an illegal trainllama.yaml in flagscale. 
You must fix it" + ) + + print(train_data) + + with open(dist_yaml, 'w') as f: + yaml.safe_dump(dist_data, f) + + with open(train_yaml, 'w') as f: + yaml.safe_dump(train_data, f) + + +if __name__ == "__main__": + args = parse_args() + print(args) + host = args.host_addr + hosts = args.hosts.split(",") + print(host, hosts) + + if host != hosts[0]: + exit(0) + + sys.path.append(os.path.dirname(args.flagperf_config_file)) + config_file = os.path.basename(args.flagperf_config_file).split('.')[0] + + module = import_module(config_file) + print(module) + scale_home = getattr(module, "scale_home") + + install_scale(module, args.log_dir) + + replace_yamls(scale_home, module, args) + + scale_conf_dir = getattr(module, "scale_conf_dir") + configyaml = getattr(module, "configyaml") + configname = os.path.splitext(os.path.basename(configyaml))[0] + exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}" + + print(exec_cmd) + with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + + timestamp_log_host = hosts[-1] + timestamp_log_noderank = len(hosts) - 1 + + timestamp_log_file = os.path.join( + args.log_dir, "outputs_llama3", "logs", "host_" + + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output") + + info_line = [] + while True: + try: + with open(timestamp_log_file, 'r') as f: + lines = f.readlines() + for line in lines: + if "elapsed time per iteration" in line: + info_line.append(line) + except Exception as e: + print("Maybe some errors") + if len(info_line) == getattr(module, "steps"): + break + time.sleep(300) + + infos = [] + for line in info_line: + info = line.split("|")[2] + steptime = info.split(":")[1] + stepsecond = float(steptime) / 1000 + infos.append(stepsecond) + print(infos) + + ave_steptime = sum(infos[1:]) / len(infos[1:]) + tps = 8192 * 1024 / ave_steptime / args.world_size + 
mfu = tps * 70E9 * 6 / getattr(module, "flops") + print(ave_steptime, tps) + print(f"MFU: {mfu}") diff --git a/training/benchmarks/llama3_70B/megatron/run_pretraining.py b/training/benchmarks/llama3_70B/megatron/run_pretraining.py index dbcdf93a7..0b12a1162 100644 --- a/training/benchmarks/llama3_70B/megatron/run_pretraining.py +++ b/training/benchmarks/llama3_70B/megatron/run_pretraining.py @@ -61,18 +61,20 @@ def parse_args(): # merge llama3 patch - - origin_file = os.path.join(megapath, "megatron/training/arguments.py") - exec_cmd = "patch --silent --forward " + origin_file + " arguments.patch -o tmp.py;mv tmp.py " + origin_file - exec_cmd = exec_cmd + ";" - - origin_file = os.path.join(megapath, "megatron/training/tokenizer/tokenizer.py") - exec_cmd = exec_cmd + "patch --silent --forward " + origin_file + " tokenizer.patch -o tmp.py;mv tmp.py " + origin_file - exec_cmd = exec_cmd + ";" - - # bash pretrain_llama3.sh - - exec_cmd = exec_cmd + "bash pretrain_llama3.sh" + if args.vendor=="cambricon": + exec_cmd = "bash pretrain_llama3.sh" + else: + origin_file = os.path.join(megapath, "megatron/training/arguments.py") + exec_cmd = "patch --silent --forward " + origin_file + " arguments.patch -o tmp.py;mv tmp.py " + origin_file + exec_cmd = exec_cmd + ";" + + origin_file = os.path.join(megapath, "megatron/training/tokenizer/tokenizer.py") + exec_cmd = exec_cmd + "patch --silent --forward " + origin_file + " tokenizer.patch -o tmp.py;mv tmp.py " + origin_file + exec_cmd = exec_cmd + ";" + + # bash pretrain_llama3.sh + + exec_cmd = exec_cmd + "bash pretrain_llama3.sh" # args diff --git a/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py b/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py new file mode 100644 index 000000000..e15f7842c --- /dev/null +++ b/training/benchmarks/llama3_70B_continuetrain/flagscale/run_pretraining.py @@ -0,0 +1,193 @@ +import subprocess +from argparse import ArgumentParser +import os +import 
sys +from importlib import import_module +import yaml +import time + + +def parse_args(): + '''we parse ddp related args, check system config args, and running env + args such as --data_dir_xxx. Then pass all useful args to the real + training script. + ''' + parser = ArgumentParser(description="flagscale main python") + parser.add_argument("--world_size", type=int, required=True) + parser.add_argument("--vendor", type=str, required=True) + parser.add_argument("--data_dir", type=str, required=True) + parser.add_argument("--hosts", type=str, required=True) + parser.add_argument("--host_addr", type=str, required=True) + parser.add_argument("--log_dir", type=str, required=True) + parser.add_argument("--flagperf_config_file", type=str, required=True) + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def install_scale(module, log_dir, debug_mode=False): + if not debug_mode: + exec_cmd = getattr(module, "scale_download_cmd") + print(exec_cmd) + + install_logdir = os.path.join(log_dir, "install_logs") + os.makedirs(install_logdir) + + logfile = os.path.join(install_logdir, "scale_download.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = getattr(module, "scale_install_cmd") + logfile = os.path.join(install_logdir, "scale_install.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + +def replace_yamls(scale_home, config_module, args): + scale_conf_dir = getattr(config_module, "scale_conf_dir") + dist_yaml = getattr(config_module, "configyaml") + with open(dist_yaml, 'r') as f: + dist_data = yaml.safe_load(f) + + try: + train_yaml = getattr(config_module, "trainyaml") + filename = os.path.basename(train_yaml) + prefix_filename = os.path.splitext(filename)[0] + dist_data["defaults"][0]["train"] = 
prefix_filename + dist_data["experiment"]["exp_dir"] = os.path.join( + args.log_dir, "outputs_llama3") + hosts = args.hosts.split(",") + dist_data["experiment"]["runner"]["nnodes"] = len(hosts) + dist_data["experiment"]["runner"]["ssh_port"] = getattr( + config_module, "flagscale_ssh_port") + hostfile = os.path.join(scale_home, "hostfile") + with open(hostfile, 'w') as f: + for host in hosts: + slots = dist_data["experiment"]["runner"]["nproc_per_node"] + chiptype = getattr(config_module, "flagscale_chip_type") + f.write(f"{host} slots={slots} type={chiptype}\n") + dist_data["experiment"]["runner"]["hostfile"] = hostfile + dist_data["experiment"]["cmds"] = getattr(config_module, "cmds") + except Exception as e: + print(e) + print( + "You're using an illegal config.yaml in flagscale. You must fix it" + ) + + print(dist_data) + + train_yaml = getattr(config_module, "trainyaml") + + with open(train_yaml, 'r') as f: + train_data = yaml.safe_load(f) + + try: + train_data["system"]["checkpoint"]["load"] = os.path.join( + args.data_dir, getattr(config_module, "ckpt")) + train_data["system"]["checkpoint"].pop("save", "None") + + train_data["model"]["train_samples"] = int( + getattr(config_module, "steps") * 1024) + train_data["model"]["use_mcore_models"] = True + train_data["model"]["transformer_impl"] = "transformer_engine" + train_data["model"]["optimizer"]["lr_scheduler"][ + "lr_warmup_samples"] = 51200 + train_data["data"]["data_path"] = os.path.join( + args.data_dir, getattr(config_module, "dataset"), + "dedup-md5-pile-pile-cc_text_document") + train_data["data"]["tokenizer"]["tokenizer_path"] = os.path.join( + args.data_dir, getattr(config_module, "tokenizer")) + except Exception as e: + print(e) + print(train_data) + print( + "You're using an illegal trainllama.yaml in flagscale. 
You must fix it" + ) + + print(train_data) + + with open(dist_yaml, 'w') as f: + yaml.safe_dump(dist_data, f) + + with open(train_yaml, 'w') as f: + yaml.safe_dump(train_data, f) + + +if __name__ == "__main__": + args = parse_args() + print(args) + host = args.host_addr + hosts = args.hosts.split(",") + print(host, hosts) + + if host != hosts[0]: + exit(0) + + sys.path.append(os.path.dirname(args.flagperf_config_file)) + config_file = os.path.basename(args.flagperf_config_file).split('.')[0] + + module = import_module(config_file) + print(module) + scale_home = getattr(module, "scale_home") + + install_scale(module, args.log_dir) + + replace_yamls(scale_home, module, args) + + scale_conf_dir = getattr(module, "scale_conf_dir") + configyaml = getattr(module, "configyaml") + configname = os.path.splitext(os.path.basename(configyaml))[0] + exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}" + + print(exec_cmd) + with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + + timestamp_log_host = hosts[-1] + timestamp_log_noderank = len(hosts) - 1 + + timestamp_log_file = os.path.join( + args.log_dir, "outputs_llama3", "logs", "host_" + + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output") + + info_line = [] + while True: + try: + with open(timestamp_log_file, 'r') as f: + lines = f.readlines() + for line in lines: + if "elapsed time per iteration" in line: + info_line.append(line) + except Exception as e: + print("Maybe some errors") + if len(info_line) == getattr(module, "steps"): + break + time.sleep(300) + + infos = [] + for line in info_line: + info = line.split("|")[2] + steptime = info.split(":")[1] + stepsecond = float(steptime) / 1000 + infos.append(stepsecond) + print(infos) + + ave_steptime = sum(infos[1:]) / len(infos[1:]) + tps = 8192 * 1024 / ave_steptime / args.world_size + 
mfu = tps * 70E9 * 6 / getattr(module, "flops") + print(ave_steptime, tps) + print(f"MFU: {mfu}") diff --git a/training/benchmarks/llama3_8B/megatron/run_pretraining.py b/training/benchmarks/llama3_8B/megatron/run_pretraining.py index 620c77d82..ff64936e6 100644 --- a/training/benchmarks/llama3_8B/megatron/run_pretraining.py +++ b/training/benchmarks/llama3_8B/megatron/run_pretraining.py @@ -61,18 +61,20 @@ def parse_args(): # merge llama3 patch - - origin_file = os.path.join(megapath, "megatron/training/arguments.py") - exec_cmd = "patch --silent --forward " + origin_file + " arguments.patch -o tmp.py;mv tmp.py " + origin_file - exec_cmd = exec_cmd + ";" - - origin_file = os.path.join(megapath, "megatron/training/tokenizer/tokenizer.py") - exec_cmd = exec_cmd + "patch --silent --forward " + origin_file + " tokenizer.patch -o tmp.py;mv tmp.py " + origin_file - exec_cmd = exec_cmd + ";" - - # bash pretrain_llama3.sh - - exec_cmd = exec_cmd + "bash pretrain_llama3.sh" + if args.vendor=="cambricon" or args.vendor=="metax": + exec_cmd = "bash pretrain_llama3.sh" + else: + origin_file = os.path.join(megapath, "megatron/training/arguments.py") + exec_cmd = "patch --silent --forward " + origin_file + " arguments.patch -o tmp.py;mv tmp.py " + origin_file + exec_cmd = exec_cmd + ";" + + origin_file = os.path.join(megapath, "megatron/training/tokenizer/tokenizer.py") + exec_cmd = exec_cmd + "patch --silent --forward " + origin_file + " tokenizer.patch -o tmp.py;mv tmp.py " + origin_file + exec_cmd = exec_cmd + ";" + + # bash pretrain_llama3.sh + + exec_cmd = exec_cmd + "bash pretrain_llama3.sh" # args diff --git a/training/benchmarks/llava1.5_13b/deepspeed-torch/evaluate/evaluator.py b/training/benchmarks/llava1.5_13b/deepspeed-torch/evaluate/evaluator.py index 058cb9ac5..ac8f73745 100644 --- a/training/benchmarks/llava1.5_13b/deepspeed-torch/evaluate/evaluator.py +++ b/training/benchmarks/llava1.5_13b/deepspeed-torch/evaluate/evaluator.py @@ -1,3 +1,8 @@ +# cambricon mlu 
import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass import os import sys import torch @@ -80,18 +85,19 @@ def main_eval(output_path, answer_path): exampels_to_eval = [] for data_id, parsed_pred in cat_outputs.items(): - question_type = cat_answers[data_id]['question_type'] - if question_type != 'multiple-choice': - parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) - else: - parsed_pred = parsed_pred - - exampels_to_eval.append({ - "id": data_id, - "question_type": question_type, - "answer": cat_answers[data_id]['ground_truth'], - "parsed_pred": parsed_pred - }) + if data_id in cat_answers: + question_type = cat_answers[data_id]['question_type'] + if question_type != 'multiple-choice': + parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) + else: + parsed_pred = parsed_pred + + exampels_to_eval.append({ + "id": data_id, + "question_type": question_type, + "answer": cat_answers[data_id]['ground_truth'], + "parsed_pred": parsed_pred + }) judge_dict, metric_dict = evaluate(exampels_to_eval) metric_dict.update({"num_example": len(exampels_to_eval)}) @@ -177,4 +183,4 @@ def eval_mmmu_llava(model_path, data_path, config_path, output_path, answer_path if __name__ == "__main__": _, model_path, data_path, config_path, output_path, answer_path = sys.argv - eval_mmmu_llava(model_path, data_path, config_path, output_path, answer_path) \ No newline at end of file + eval_mmmu_llava(model_path, data_path, config_path, output_path, answer_path) diff --git a/training/benchmarks/llava1.5_13b/deepspeed-torch/run_pretraining.py b/training/benchmarks/llava1.5_13b/deepspeed-torch/run_pretraining.py index cc36be233..233190bfa 100644 --- a/training/benchmarks/llava1.5_13b/deepspeed-torch/run_pretraining.py +++ b/training/benchmarks/llava1.5_13b/deepspeed-torch/run_pretraining.py @@ -112,7 +112,7 @@ def get_argument_parser(): mmmu_answer_path ]) 
whole_tps_pretrain = (tokens_pretrain * 558128) / pretrain_time # 714 - chip_tps_pretrain = whole_tps_pretrain / args.nproc_per_node * args.nnodes + chip_tps_pretrain = whole_tps_pretrain / (args.nproc_per_node * args.nnodes) print("Pretrain stage") print("System tokens per second: ", whole_tps_pretrain) print("Tokens/p/s: ", chip_tps_pretrain) @@ -121,7 +121,7 @@ def get_argument_parser(): print("Tokens/TFLOPS: ", chip_tps_pretrain / TFLOPS) print("MFU: ", chip_tps_pretrain * 13000000000.0 * 2 / theoryflops) whole_tps_finetune = (tokens_finetune * 665344) / finetune_time - chip_tps_finetune = whole_tps_finetune / args.nproc_per_node * args.nnodes + chip_tps_finetune = whole_tps_finetune / (args.nproc_per_node * args.nnodes) print("Finetune stage") print("System tokens per second: ", whole_tps_finetune) print("Tokens/p/s: ", chip_tps_finetune) diff --git a/training/benchmarks/llava1.5_13b/deepspeed-torch/train/train_mem.py b/training/benchmarks/llava1.5_13b/deepspeed-torch/train/train_mem.py index 425a46f25..6f7379089 100644 --- a/training/benchmarks/llava1.5_13b/deepspeed-torch/train/train_mem.py +++ b/training/benchmarks/llava1.5_13b/deepspeed-torch/train/train_mem.py @@ -1,3 +1,8 @@ +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass from train import train if __name__ == "__main__": diff --git a/training/benchmarks/llava1.5_7b/deepspeed-torch/evaluate/evaluator.py b/training/benchmarks/llava1.5_7b/deepspeed-torch/evaluate/evaluator.py index 058cb9ac5..ac8f73745 100644 --- a/training/benchmarks/llava1.5_7b/deepspeed-torch/evaluate/evaluator.py +++ b/training/benchmarks/llava1.5_7b/deepspeed-torch/evaluate/evaluator.py @@ -1,3 +1,8 @@ +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass import os import sys import torch @@ -80,18 +85,19 @@ def main_eval(output_path, answer_path): exampels_to_eval = [] for data_id, parsed_pred in 
cat_outputs.items(): - question_type = cat_answers[data_id]['question_type'] - if question_type != 'multiple-choice': - parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) - else: - parsed_pred = parsed_pred - - exampels_to_eval.append({ - "id": data_id, - "question_type": question_type, - "answer": cat_answers[data_id]['ground_truth'], - "parsed_pred": parsed_pred - }) + if data_id in cat_answers: + question_type = cat_answers[data_id]['question_type'] + if question_type != 'multiple-choice': + parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) + else: + parsed_pred = parsed_pred + + exampels_to_eval.append({ + "id": data_id, + "question_type": question_type, + "answer": cat_answers[data_id]['ground_truth'], + "parsed_pred": parsed_pred + }) judge_dict, metric_dict = evaluate(exampels_to_eval) metric_dict.update({"num_example": len(exampels_to_eval)}) @@ -177,4 +183,4 @@ def eval_mmmu_llava(model_path, data_path, config_path, output_path, answer_path if __name__ == "__main__": _, model_path, data_path, config_path, output_path, answer_path = sys.argv - eval_mmmu_llava(model_path, data_path, config_path, output_path, answer_path) \ No newline at end of file + eval_mmmu_llava(model_path, data_path, config_path, output_path, answer_path) diff --git a/training/benchmarks/llava1.5_7b/deepspeed-torch/run_pretraining.py b/training/benchmarks/llava1.5_7b/deepspeed-torch/run_pretraining.py index f8d7e272c..cfa4c4f05 100644 --- a/training/benchmarks/llava1.5_7b/deepspeed-torch/run_pretraining.py +++ b/training/benchmarks/llava1.5_7b/deepspeed-torch/run_pretraining.py @@ -114,7 +114,7 @@ def get_argument_parser(): mmmu_answer_path ]) whole_tps_pretrain = (tokens_pretrain * 558128) / pretrain_time # 714 - chip_tps_pretrain = whole_tps_pretrain / args.nproc_per_node * args.nnodes + chip_tps_pretrain = whole_tps_pretrain / (args.nproc_per_node * args.nnodes) print("Pretrain stage") 
print("System tokens per second: ", whole_tps_pretrain) print("Tokens/p/s: ", chip_tps_pretrain) @@ -123,7 +123,7 @@ def get_argument_parser(): print("Tokens/TFLOPS: ", chip_tps_pretrain / TFLOPS) print("MFU: ", chip_tps_pretrain * 7000000000.0 * 2 / theoryflops) whole_tps_finetune = (tokens_finetune * 665344) / finetune_time - chip_tps_finetune = whole_tps_finetune / args.nproc_per_node * args.nnodes + chip_tps_finetune = whole_tps_finetune / (args.nproc_per_node * args.nnodes) print("Finetune stage") print("System tokens per second: ", whole_tps_finetune) print("Tokens/p/s: ", chip_tps_finetune) @@ -139,4 +139,4 @@ def get_argument_parser(): args.nnodes) / theoryflops print("two-stage average") print("MFU: ", mfu_average) - print("Actual computing power: ", mfu_average * TFLOPS) \ No newline at end of file + print("Actual computing power: ", mfu_average * TFLOPS) diff --git a/training/benchmarks/llava1.5_7b/deepspeed-torch/train/train_mem.py b/training/benchmarks/llava1.5_7b/deepspeed-torch/train/train_mem.py index 425a46f25..6f7379089 100644 --- a/training/benchmarks/llava1.5_7b/deepspeed-torch/train/train_mem.py +++ b/training/benchmarks/llava1.5_7b/deepspeed-torch/train/train_mem.py @@ -1,3 +1,8 @@ +# cambricon mlu import +try: + from torch_mlu.utils.model_transfer import transfer +except ImportError: + pass from train import train if __name__ == "__main__": diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py new file mode 100644 index 000000000..dd2871dae --- /dev/null +++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py @@ -0,0 +1,234 @@ +import subprocess +from argparse import ArgumentParser +import os +import sys +from importlib import import_module +import yaml +import time + + +def parse_args(): + '''we parse ddp related args, check system config args, and running env + args such as --data_dir_xxx. Then pass all useful args to the real + training script. 
+ ''' + parser = ArgumentParser(description="flagscale main python") + parser.add_argument("--world_size", type=int, required=True) + parser.add_argument("--vendor", type=str, required=True) + parser.add_argument("--data_dir", type=str, required=True) + parser.add_argument("--hosts", type=str, required=True) + parser.add_argument("--host_addr", type=str, required=True) + parser.add_argument("--log_dir", type=str, required=True) + parser.add_argument("--flagperf_config_file", type=str, required=True) + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def install_scale(module, log_dir, debug_mode=False): + if not debug_mode: + exec_cmd = getattr(module, "scale_download_cmd") + print(exec_cmd) + + install_logdir = os.path.join(log_dir, "install_logs") + os.makedirs(install_logdir) + + logfile = os.path.join(install_logdir, "scale_download.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = getattr(module, "scale_install_cmd") + logfile = os.path.join(install_logdir, "scale_install.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = getattr(module, "energon_locate_cmd") + logfile = os.path.join(install_logdir, "energon_locate.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + with open(logfile, 'r') as f: + energon_locate = f.readline().replace('\n', '') + print(energon_locate) + + src_dir = os.path.join(energon_locate, "megatron", "energon") + dst_dir = os.path.join(getattr(module, "scale_home"), "megatron", + "megatron") + exec_cmd = f"cp -r {src_dir} {dst_dir}/" + + logfile = os.path.join(install_logdir, "energon_copy.log.txt") + with open(logfile, 'w') as f: + p = 
subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + +def replace_yamls(scale_home, config_module, args): + scale_conf_dir = getattr(config_module, "scale_conf_dir") + dist_yaml = getattr(config_module, "configyaml") + with open(dist_yaml, 'r') as f: + dist_data = yaml.safe_load(f) + + try: + dist_data["experiment"]["exp_dir"] = os.path.join( + args.log_dir, "outputs_llava1.5") + hosts = args.hosts.split(",") + dist_data["experiment"]["runner"]["nnodes"] = len(hosts) + dist_data["experiment"]["runner"]["ssh_port"] = getattr( + config_module, "flagscale_ssh_port") + hostfile = os.path.join(scale_home, "hostfile") + with open(hostfile, 'w') as f: + for host in hosts: + slots = dist_data["experiment"]["runner"]["nproc_per_node"] + chiptype = getattr(config_module, "flagscale_chip_type") + f.write(f"{host} slots={slots} type={chiptype}\n") + dist_data["experiment"]["runner"]["hostfile"] = hostfile + dist_data["experiment"]["cmds"] = getattr(config_module, "cmds") + except Exception as e: + print(e) + print( + "You're using an illegal config.yaml in flagscale. 
You must fix it" + ) + + print(dist_data) + + train_yaml = getattr(config_module, "trainyaml") + + with open(train_yaml, 'r') as f: + train_data = yaml.safe_load(f) + + try: + train_data["system"]["checkpoint"]["save_interval"] = 1000 + train_data["system"]["checkpoint"][ + "pretrained_checkpoint"] = os.path.join( + args.data_dir, "LLaVA_megatron", + "vicuna_instruct_clip336_tp1_combined_mcore") + + train_data["model"]["train_iters"] = getattr(config_module, "steps") + train_data["model"].pop("img_embedding_idx", None) + train_data["data"]["data_path"] = getattr(config_module, "datasetyaml") + train_data["data"]["valid_path"] = getattr(config_module, + "datasetyaml") + train_data["data"]["prompt_path"] = getattr(config_module, "prompt") + train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join( + args.data_dir, "vicuna-7b-v1___5/tokenizer.model") + except Exception as e: + print( + "You're using an illegal trainllava.yaml in flagscale. You must fix it" + ) + + print(train_data) + + dataset_yaml = getattr(config_module, "datasetyaml") + + with open(dataset_yaml, 'r') as f: + dataset_data = yaml.safe_load(f) + + try: + llava_train_dir = os.path.join(args.data_dir, "LLaVA-Pretrain/wds") + dataset_data["splits"]["train"]["datasets"][0][ + "path"] = llava_train_dir + dataset_data["splits"]["val"]["datasets"][0]["path"] = llava_train_dir + except Exception as e: + print( + "You're using an illegal dataset.yaml in flagscale. 
You must fix it" + ) + + print(dataset_data) + + with open(dist_yaml, 'w') as f: + yaml.safe_dump(dist_data, f) + + with open(train_yaml, 'w') as f: + yaml.safe_dump(train_data, f) + + with open(dataset_yaml, 'w') as f: + yaml.safe_dump(dataset_data, f) + + +if __name__ == "__main__": + args = parse_args() + print(args) + host = args.host_addr + hosts = args.hosts.split(",") + print(host, hosts) + + if host != hosts[0]: + exit(0) + + sys.path.append(os.path.dirname(args.flagperf_config_file)) + config_file = os.path.basename(args.flagperf_config_file).split('.')[0] + + module = import_module(config_file) + print(module) + scale_home = getattr(module, "scale_home") + + install_scale(module, args.log_dir) + + replace_yamls(scale_home, module, args) + + scale_conf_dir = getattr(module, "scale_conf_dir") + configyaml = getattr(module, "configyaml") + configname = os.path.splitext(os.path.basename(configyaml))[0] + exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}" + + print(exec_cmd) + with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + + timestamp_log_host = hosts[-1] + timestamp_log_noderank = len(hosts) - 1 + + timestamp_log_file = os.path.join( + args.log_dir, "outputs_llava1.5", "logs", "host_" + + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output") + + info_line = [] + while True: + try: + with open(timestamp_log_file, 'r') as f: + lines = f.readlines() + for line in lines: + if "elapsed time per iteration" in line: + info_line.append(line) + except Exception as e: + print("Maybe some errors") + if len(info_line) == getattr(module, "steps"): + break + time.sleep(300) + + infos = [] + for line in info_line: + info = line.split("|")[2] + steptime = info.split(":")[1] + stepsecond = float(steptime) / 1000 + infos.append(stepsecond) + print(infos) + + ave_steptime = 
sum(infos[1:]) / len(infos[1:]) + tps = 2048 * 256 / ave_steptime / args.world_size + mfu = tps * 7E9 * 6 / getattr(module, "flops") + print(ave_steptime, tps) + print(f"MFU: {mfu}") diff --git a/training/benchmarks/llava1.5_7b_continuetrain/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b_continuetrain/flagscale/run_pretraining.py new file mode 100644 index 000000000..e2a4a238b --- /dev/null +++ b/training/benchmarks/llava1.5_7b_continuetrain/flagscale/run_pretraining.py @@ -0,0 +1,234 @@ +import subprocess +from argparse import ArgumentParser +import os +import sys +from importlib import import_module +import yaml +import time + + +def parse_args(): + '''we parse ddp related args, check system config args, and running env + args such as --data_dir_xxx. Then pass all useful args to the real + training script. + ''' + parser = ArgumentParser(description="flagscale main python") + parser.add_argument("--world_size", type=int, required=True) + parser.add_argument("--vendor", type=str, required=True) + parser.add_argument("--data_dir", type=str, required=True) + parser.add_argument("--hosts", type=str, required=True) + parser.add_argument("--host_addr", type=str, required=True) + parser.add_argument("--log_dir", type=str, required=True) + parser.add_argument("--flagperf_config_file", type=str, required=True) + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def install_scale(module, log_dir, debug_mode=False): + if not debug_mode: + exec_cmd = getattr(module, "scale_download_cmd") + print(exec_cmd) + + install_logdir = os.path.join(log_dir, "install_logs") + os.makedirs(install_logdir) + + logfile = os.path.join(install_logdir, "scale_download.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = getattr(module, "scale_install_cmd") + logfile = os.path.join(install_logdir, 
"scale_install.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = getattr(module, "energon_locate_cmd") + logfile = os.path.join(install_logdir, "energon_locate.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + with open(logfile, 'r') as f: + energon_locate = f.readline().replace('\n', '') + print(energon_locate) + + src_dir = os.path.join(energon_locate, "megatron", "energon") + dst_dir = os.path.join(getattr(module, "scale_home"), "megatron", + "megatron") + exec_cmd = f"cp -r {src_dir} {dst_dir}/" + + logfile = os.path.join(install_logdir, "energon_copy.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + f.close() + + +def replace_yamls(scale_home, config_module, args): + scale_conf_dir = getattr(config_module, "scale_conf_dir") + dist_yaml = getattr(config_module, "configyaml") + with open(dist_yaml, 'r') as f: + dist_data = yaml.safe_load(f) + + try: + dist_data["experiment"]["exp_dir"] = os.path.join( + args.log_dir, "outputs_llava1.5") + hosts = args.hosts.split(",") + dist_data["experiment"]["runner"]["nnodes"] = len(hosts) + dist_data["experiment"]["runner"]["ssh_port"] = getattr( + config_module, "flagscale_ssh_port") + hostfile = os.path.join(scale_home, "hostfile") + with open(hostfile, 'w') as f: + for host in hosts: + slots = dist_data["experiment"]["runner"]["nproc_per_node"] + chiptype = getattr(config_module, "flagscale_chip_type") + f.write(f"{host} slots={slots} type={chiptype}\n") + dist_data["experiment"]["runner"]["hostfile"] = hostfile + dist_data["experiment"]["cmds"] = getattr(config_module, "cmds") + except Exception as e: + print(e) + print( + "You're using an illegal config.yaml in flagscale. 
You must fix it" + ) + + print(dist_data) + + train_yaml = getattr(config_module, "trainyaml") + + with open(train_yaml, 'r') as f: + train_data = yaml.safe_load(f) + + try: + train_data["system"]["checkpoint"]["save_interval"] = 1000 + train_data["system"]["checkpoint"][ + "pretrained_checkpoint"] = os.path.join( + args.data_dir, "LLaVA_megatron", + "vicuna_instruct_clip336_mlp_tp1_combined_mcore") + + train_data["model"]["train_iters"] = getattr(config_module, "steps") + train_data["model"].pop("img_embedding_idx", None) + train_data["data"]["data_path"] = getattr(config_module, "datasetyaml") + train_data["data"]["valid_path"] = getattr(config_module, + "datasetyaml") + train_data["data"]["prompt_path"] = getattr(config_module, "prompt") + train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join( + args.data_dir, "vicuna-7b-v1___5/tokenizer.model") + except Exception as e: + print( + "You're using an illegal trainllava.yaml in flagscale. You must fix it" + ) + + print(train_data) + + dataset_yaml = getattr(config_module, "datasetyaml") + + with open(dataset_yaml, 'r') as f: + dataset_data = yaml.safe_load(f) + + try: + llava_train_dir = os.path.join(args.data_dir, "LLaVA-Pretrain/wds") + dataset_data["splits"]["train"]["datasets"][0][ + "path"] = llava_train_dir + dataset_data["splits"]["val"]["datasets"][0]["path"] = llava_train_dir + except Exception as e: + print( + "You're using an illegal dataset.yaml in flagscale. 
You must fix it" + ) + + print(dataset_data) + + with open(dist_yaml, 'w') as f: + yaml.safe_dump(dist_data, f) + + with open(train_yaml, 'w') as f: + yaml.safe_dump(train_data, f) + + with open(dataset_yaml, 'w') as f: + yaml.safe_dump(dataset_data, f) + + +if __name__ == "__main__": + args = parse_args() + print(args) + host = args.host_addr + hosts = args.hosts.split(",") + print(host, hosts) + + if host != hosts[0]: + exit(0) + + sys.path.append(os.path.dirname(args.flagperf_config_file)) + config_file = os.path.basename(args.flagperf_config_file).split('.')[0] + + module = import_module(config_file) + print(module) + scale_home = getattr(module, "scale_home") + + install_scale(module, args.log_dir) + + replace_yamls(scale_home, module, args) + + scale_conf_dir = getattr(module, "scale_conf_dir") + configyaml = getattr(module, "configyaml") + configname = os.path.splitext(os.path.basename(configyaml))[0] + exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}" + + print(exec_cmd) + with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) + p.wait() + + timestamp_log_host = hosts[-1] + timestamp_log_noderank = len(hosts) - 1 + + timestamp_log_file = os.path.join( + args.log_dir, "outputs_llava1.5", "logs", "host_" + + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output") + + info_line = [] + while True: + try: + with open(timestamp_log_file, 'r') as f: + lines = f.readlines() + for line in lines: + if "elapsed time per iteration" in line: + info_line.append(line) + except Exception as e: + print("Maybe some errors") + if len(info_line) == getattr(module, "steps"): + break + time.sleep(300) + + infos = [] + for line in info_line: + info = line.split("|")[2] + steptime = info.split(":")[1] + stepsecond = float(steptime) / 1000 + infos.append(stepsecond) + print(infos) + + ave_steptime = 
sum(infos[1:]) / len(infos[1:]) + tps = 2048 * 256 / ave_steptime / args.world_size + mfu = tps * 7E9 * 6 / getattr(module, "flops") + print(ave_steptime, tps) + print(f"MFU: {mfu}") diff --git a/training/cambricon/aquila2_34B_container-in_container/config.py b/training/cambricon/aquila2_34B_container-in_container/config.py index e105e1bad..97950bf7d 100755 --- a/training/cambricon/aquila2_34B_container-in_container/config.py +++ b/training/cambricon/aquila2_34B_container-in_container/config.py @@ -8,7 +8,7 @@ # ========================================================= # chip attribute # ========================================================= -flops_16bit = "294900000000000" +flops_16bit = "0.0" # ========================================================= # env attribute diff --git a/training/cambricon/aquila2_70B_container-in_container/config.py b/training/cambricon/aquila2_70B_container-in_container/config.py index e105e1bad..97950bf7d 100755 --- a/training/cambricon/aquila2_70B_container-in_container/config.py +++ b/training/cambricon/aquila2_70B_container-in_container/config.py @@ -8,7 +8,7 @@ # ========================================================= # chip attribute # ========================================================= -flops_16bit = "294900000000000" +flops_16bit = "0.0" # ========================================================= # env attribute diff --git a/training/cambricon/aquila2_7B_container-in_container/config.py b/training/cambricon/aquila2_7B_container-in_container/config.py index e105e1bad..97950bf7d 100755 --- a/training/cambricon/aquila2_7B_container-in_container/config.py +++ b/training/cambricon/aquila2_7B_container-in_container/config.py @@ -8,7 +8,7 @@ # ========================================================= # chip attribute # ========================================================= -flops_16bit = "294900000000000" +flops_16bit = "0.0" # ========================================================= # env attribute diff --git 
a/training/cambricon/baichuan2_13b-deepspeed/README.md b/training/cambricon/baichuan2_13b-deepspeed/README.md new file mode 100644 index 000000000..f07d1fd06 --- /dev/null +++ b/training/cambricon/baichuan2_13b-deepspeed/README.md @@ -0,0 +1,52 @@ +### Cambricon MLU配置与运行信息参考 +#### 环境配置 +- ##### MLU硬件环境 + - 机器型号: / + - 加速卡型号: / + - CPU型号: / + - 多机网络类型、带宽: / + +- ##### MLU软件环境 + - OS版本:Ubuntu 22.04 LTS + - OS kernel版本: 5.15.0-107-generic + - 加速卡驱动版本:5.10.34 + - Docker 版本:24.04 + - 训练框架版本:deepspeed 0.10.1 + +- ##### 并行策略 + + - 并行技术:sharded data parallel + - 实施者:deepspeed ZeRO-DP + - 实施细节:ZeRO-DP O1, DP_SIZE=8 + +- ##### 优化策略 + + - gradient checkpointing + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_MLUx1x8.py中所写,在本case中默认为2 + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_MLUx1x8.py中所写,在本case中默认为2048 + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为8,精度对齐实验默认为64 + 4. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------- | --------------------- | ----------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | baichuan2_13b | | +| 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | +| 数据精度 | bf16 | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | +| 硬件设备简称 | / | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem | MFU | +| --------------- | -------------------------- | --------- | ---- | ----- | ----- | +| / 1机8卡(1x8) | / | / | / | / | / | diff --git a/training/cambricon/baichuan2_13b-deepspeed/config/config_MLUx1x8.py b/training/cambricon/baichuan2_13b-deepspeed/config/config_MLUx1x8.py new file mode 100644 index 000000000..061b32ee8 --- /dev/null +++ b/training/cambricon/baichuan2_13b-deepspeed/config/config_MLUx1x8.py 
@@ -0,0 +1,12 @@ +seqlength = 2048 +batchsize = 2 +datafilename = "openwebtext_baichuan2_100M.npy" +# 请在Flagperf目录下,创建MLU_FP16_FLOPS.py, +# 文件包含MLU硬件算力值,示例如下: +# MLU_FP16_FLOPS=1.0 +FLOPS_DIR='../../../../' +import sys +sys.path.append(FLOPS_DIR) +from MLU_FP16_FLOPS import MLU_FP16_FLOPS +theoryflops = float(MLU_FP16_FLOPS) +epochs = 1 diff --git a/training/cambricon/baichuan2_13b-deepspeed/config/config_MLUx2x8.py b/training/cambricon/baichuan2_13b-deepspeed/config/config_MLUx2x8.py new file mode 100644 index 000000000..ed070856b --- /dev/null +++ b/training/cambricon/baichuan2_13b-deepspeed/config/config_MLUx2x8.py @@ -0,0 +1,12 @@ +seqlength = 2048 +batchsize = 1 +datafilename = "openwebtext_baichuan2_100M.npy" +# 请在Flagperf目录下,创建MLU_FP16_FLOPS.py, +# 文件包含MLU硬件算力值,示例如下: +# MLU_FP16_FLOPS=1.0 +FLOPS_DIR='../../../../' +import sys +sys.path.append(FLOPS_DIR) +from MLU_FP16_FLOPS import MLU_FP16_FLOPS +theoryflops = float(MLU_FP16_FLOPS) +epochs = 1 diff --git a/training/cambricon/baichuan2_13b-deepspeed/config/ds_config.json b/training/cambricon/baichuan2_13b-deepspeed/config/ds_config.json new file mode 100644 index 000000000..2c66645d9 --- /dev/null +++ b/training/cambricon/baichuan2_13b-deepspeed/config/ds_config.json @@ -0,0 +1,4 @@ +{ + "gradient_accumulation_steps": 64 + } + diff --git a/training/cambricon/baichuan2_13b-deepspeed/config/net.sh b/training/cambricon/baichuan2_13b-deepspeed/config/net.sh new file mode 100644 index 000000000..e69de29bb diff --git a/training/cambricon/baichuan2_13b-deepspeed/config/requirements.txt b/training/cambricon/baichuan2_13b-deepspeed/config/requirements.txt new file mode 100644 index 000000000..112148b3c --- /dev/null +++ b/training/cambricon/baichuan2_13b-deepspeed/config/requirements.txt @@ -0,0 +1,4 @@ +regex==2024.5.15 +schedule==1.2.2 +accelerate==0.31.0 +transformers==4.40.1 \ No newline at end of file diff --git a/training/cambricon/cambricon_monitor.py b/training/cambricon/cambricon_monitor.py index 
8a976fb48..90db55a0c 100755 --- a/training/cambricon/cambricon_monitor.py +++ b/training/cambricon/cambricon_monitor.py @@ -75,10 +75,11 @@ def run(self): def mlu_mon(file): TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') - cmd = "cnmon |grep 'Default'|awk '{print $3,$4,$5,$9,$10,$11,$2}' && cnmon |grep 'MLU590-M9'|awk '{print $9}';" + cmd = "paste <(cnmon |grep 'Default') <(cnmon |grep 'MLU' | head -n -1) | awk '{print $3,$4,$5,$9,$10,$11,$25}'; echo \"\"" process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, + executable="/bin/bash", stderr=subprocess.STDOUT, encoding='utf-8') try: @@ -211,7 +212,7 @@ def parse_args(): type=str, metavar='[log_path]', required=False, - default='./logs/', + default='/tmp/', help='log path') args = parse.parse_args() return args @@ -285,4 +286,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/training/cambricon/docker_image/deepspeed-torch/Dockerfile b/training/cambricon/docker_image/deepspeed-torch/Dockerfile new file mode 100644 index 000000000..f601c5adb --- /dev/null +++ b/training/cambricon/docker_image/deepspeed-torch/Dockerfile @@ -0,0 +1,14 @@ +FROM flagperf:cambricon-deepspeed-v24.06-torch2.1.0-catch1.21.0-ubuntu22.04-py310-megatron-patch +#shell +SHELL ["/bin/bash", "-c"] +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata && apt-get install -y openssh-server && mkdir -p /run/sshd +RUN cp /etc/apt/sources.list /etc/apt/sources.list.backup \ + && sed -i 's|http://.*archive.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list \ + && sed -i 's|http://.*security.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list +RUN apt update -y && apt install -y sudo dmidecode ipmitool sysstat net-tools sshpass +# modify ~/.bashrc file +RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc +RUN echo -e "\n# Add environment 
variables\n\ +export NEUWARE_HOME=/usr/local/neuware\n\ +export LD_LIBRARY_PATH=/usr/local/mpi_wrapper/build/install/lib64:/usr/local/neuware/lib64:/usr/local/openmpi/lib:${LD_LIBRARY_PATH}\n\ +export PATH=/torch/venv3/pytorch/bin:/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}" >> ~/.bashrc diff --git a/training/cambricon/docker_image/deepspeed-torch/deepspeed-torch_install.sh b/training/cambricon/docker_image/deepspeed-torch/deepspeed-torch_install.sh new file mode 100644 index 000000000..17a77b3f9 --- /dev/null +++ b/training/cambricon/docker_image/deepspeed-torch/deepspeed-torch_install.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -xe +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip install loguru schedule protobuf sentencepiece datasets==2.15.0 schedule==1.2.2 safetensors==0.4.3 numpy==1.26.4 +pip uninstall -y transformer-engine +# transformers and accelarate +git clone https://gitee.com/xiaoqi25478/cambricon_wheels.git +cd cambricon_wheels/transformers +pip install -e . +cd ../accelerate +pip install -e . 
+cd ../../ \ No newline at end of file diff --git a/training/cambricon/docker_image/deepspeed/Dockerfile b/training/cambricon/docker_image/deepspeed/Dockerfile new file mode 100644 index 000000000..f601c5adb --- /dev/null +++ b/training/cambricon/docker_image/deepspeed/Dockerfile @@ -0,0 +1,14 @@ +FROM flagperf:cambricon-deepspeed-v24.06-torch2.1.0-catch1.21.0-ubuntu22.04-py310-megatron-patch +#shell +SHELL ["/bin/bash", "-c"] +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata && apt-get install -y openssh-server && mkdir -p /run/sshd +RUN cp /etc/apt/sources.list /etc/apt/sources.list.backup \ + && sed -i 's|http://.*archive.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list \ + && sed -i 's|http://.*security.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list +RUN apt update -y && apt install -y sudo dmidecode ipmitool sysstat net-tools sshpass +# modify ~/.bashrc file +RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc +RUN echo -e "\n# Add environment variables\n\ +export NEUWARE_HOME=/usr/local/neuware\n\ +export LD_LIBRARY_PATH=/usr/local/mpi_wrapper/build/install/lib64:/usr/local/neuware/lib64:/usr/local/openmpi/lib:${LD_LIBRARY_PATH}\n\ +export PATH=/torch/venv3/pytorch/bin:/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}" >> ~/.bashrc diff --git a/training/cambricon/docker_image/deepspeed/deepspeed_install.sh b/training/cambricon/docker_image/deepspeed/deepspeed_install.sh new file mode 100644 index 000000000..5814907d4 --- /dev/null +++ b/training/cambricon/docker_image/deepspeed/deepspeed_install.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -xe +pip install regex==2024.5.15 +pip install schedule==1.2.2 +pip install accelerate==0.31.0 +pip install transformers==4.40.1 \ No newline at end of file diff --git 
a/training/cambricon/docker_image/megatron/Dockerfile b/training/cambricon/docker_image/megatron/Dockerfile new file mode 100644 index 000000000..b2242d7c0 --- /dev/null +++ b/training/cambricon/docker_image/megatron/Dockerfile @@ -0,0 +1,17 @@ +FROM flagperf:cambricon-deepspeed-v24.06-torch2.1.0-catch1.21.0-ubuntu22.04-py310-megatron-patch +#shell +SHELL ["/bin/bash", "-c"] +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq --no-install-recommends tzdata && apt-get install -y openssh-server && mkdir -p /run/sshd +RUN cp /etc/apt/sources.list /etc/apt/sources.list.backup \ + && sed -i 's|http://.*archive.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list \ + && sed -i 's|http://.*security.ubuntu.com/ubuntu/|https://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list +RUN apt update -y && apt install -y sudo dmidecode ipmitool sysstat net-tools sshpass +# modify ~/.bashrc file +RUN sed -i '/\[ -z "\$PS1" \] \&\& return/s/^/#/' ~/.bashrc +RUN echo -e "\n# Add environment variables\n\ +export NEUWARE_HOME=/usr/local/neuware\n\ +export LD_LIBRARY_PATH=/usr/local/mpi_wrapper/build/install/lib64:/usr/local/neuware/lib64:/usr/local/openmpi/lib:${LD_LIBRARY_PATH}\n\ +export PATH=/torch/venv5/pytorch/bin:/torch/venv3/pytorch/bin:/usr/local/neuware/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH}\n\ +export CNCL_MLULINK_OVER_ROCE_DISABLE=1\n\ +export CNCL_MLULINK_CROSS_HOSTS_ENABLE=0\n\ +export CNCL_MLU_DIRECT_LEVEL=1" >> ~/.bashrc diff --git a/training/cambricon/docker_image/megatron/megatron_install.sh b/training/cambricon/docker_image/megatron/megatron_install.sh new file mode 100644 index 000000000..0f2d2df25 --- /dev/null +++ b/training/cambricon/docker_image/megatron/megatron_install.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -xe +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip install regex==2024.5.15 schedule==1.2.2 
accelerate==0.31.0 transformers==4.40.1 pybind11 \ No newline at end of file diff --git a/training/cambricon/llama3_70B-megatron/README.md b/training/cambricon/llama3_70B-megatron/README.md new file mode 100644 index 000000000..66205c485 --- /dev/null +++ b/training/cambricon/llama3_70B-megatron/README.md @@ -0,0 +1,53 @@ + +### Cambricon MLU配置与运行信息参考 +#### MLU环境配置 +- ##### 软硬件环境 + + - 加速卡型号: / + - 多机网络类型、带宽: / + + - 训练框架版本:megatron-core tag:core_v0.6.0 + - 依赖软件版本:sentencepiece==0.2.0, transformers==4.40.1 + +- ##### 并行策略 + + - 并行技术:张量、流水、数据混合并行,具体并行方案见“运行情况”章节 + - 实施者:megatron-core + - 实施细节:PP4DP2TP8 + +- ##### 优化策略 + + - flash attention 2 + - recompute-activations + - transformer-engine impl + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_MLUx4x8.py中所写,在本case中默认为1。**厂商适配时可任意更改** + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_MLUx4x8.py中所写,在本case中默认为8192,原则上不可更改 + 3. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size。在本case中,data_parallel_size=world_size/TPsize/PPsize。在本case中默认为512,使得globalbatchsize=4M tokens。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------- | --------------------- | ---------------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama3_70b | | +| 数据集 | wudao | wudao数据集来源于智源研究院
bin/idx数据集文件来源于阿里云灵骏团队
使用llama3 tokenizer预处理 | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/bf16 | +| 超参修改 | parallel,见“性能指标” | 格式为PPxDPyTPz,例如PP2DP2TP8 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | cambricon / | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +精度对齐需第21步及之后,所有步的loss与nvidia对应步的loss平均相对误差小于2%。NVloss曲线请联系智源研究院获取 + +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------------ | --------- | ----------- | ------ | --------- | ---------- | ----- | ----------- | +| MLU四机32卡(4x8) | / | / | / | | / | / | / | + diff --git a/training/cambricon/llama3_70B-megatron/config/config_MLUx4x8.py b/training/cambricon/llama3_70B-megatron/config/config_MLUx4x8.py new file mode 100644 index 000000000..ab90ceae4 --- /dev/null +++ b/training/cambricon/llama3_70B-megatron/config/config_MLUx4x8.py @@ -0,0 +1,14 @@ +tokenizer_path = "llama3_70b_hf" +localbs = 1 +train_steps = 300 +# 请在Flagperf目录下,创建MLU_FP16_FLOPS.py, +# 文件包含MLU硬件算力值,示例如下: +# MLU_FP16_FLOPS=1.0 +FLOPS_DIR='../../../../' +import sys +sys.path.append(FLOPS_DIR) +from MLU_FP16_FLOPS import MLU_FP16_FLOPS +theoryflops = float(MLU_FP16_FLOPS) +megatron_path = "/workspace/Megatron-LM" # need to be aligned with DockerFile. 
In NGCtorch, it's /workspace/ + Megatron-LM +tensor_parallel = 8 +pipeline_parallel = 2 diff --git a/training/cambricon/llama3_70B-megatron/config/requirements.txt b/training/cambricon/llama3_70B-megatron/config/requirements.txt new file mode 100644 index 000000000..112148b3c --- /dev/null +++ b/training/cambricon/llama3_70B-megatron/config/requirements.txt @@ -0,0 +1,4 @@ +regex==2024.5.15 +schedule==1.2.2 +accelerate==0.31.0 +transformers==4.40.1 \ No newline at end of file diff --git a/training/cambricon/llama3_70B-megatron/config/training_adapter.sh b/training/cambricon/llama3_70B-megatron/config/training_adapter.sh new file mode 100644 index 000000000..e69de29bb diff --git a/training/cambricon/llama3_8B-megatron/README.md b/training/cambricon/llama3_8B-megatron/README.md new file mode 100644 index 000000000..9c57011e2 --- /dev/null +++ b/training/cambricon/llama3_8B-megatron/README.md @@ -0,0 +1,60 @@ + +### Cambricon MLU配置与运行信息参考 +#### MLU环境配置 +- ##### 硬件环境 + + - 机器型号: / + - 加速卡型号: / + - CPU型号: / + - 多机网络类型、带宽: / + +- ##### 软件环境 + + - OS版本:Ubuntu 22.04 LTS + - OS kernel版本: 5.15.0-107-generic + - 加速卡驱动版本:5.10.34 + - Docker 版本:24.04 + - 训练框架版本:megatron-core tag:core_v0.6.0 + - 依赖软件版本:sentencepiece==0.2.0, transformers==4.40.1 + +- ##### 并行策略 + + - 并行技术:张量、流水、数据混合并行,具体并行方案见“运行情况”章节 + - 实施者:megatron-core + - 实施细节:PP2DP4TP1 + +- ##### 优化策略 + + - flash attention 2 + - recompute-activations + - transformer-engine impl + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_MLUx1x8.py中所写,在本case中默认为1。**厂商适配时可任意更改** + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_MLUx1x8.py中所写,在本case中默认为8192,原则上不可更改 + 3. 
global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size。在本case中,data_parallel_size=world_size/TPsize/PPsize。在本case中默认为512,使得globalbatchsize=4M tokens。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------- | --------------------- | ---------------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama3_8b | | +| 数据集 | wudao | wudao数据集来源于智源研究院
bin/idx数据集文件来源于阿里云灵骏团队
使用llama3 tokenizer预处理 | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/bf16 | +| 超参修改 | parallel,见“性能指标” | 格式为PPxDPyTPz,例如PP2DP4TP1 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | cambricon / | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +精度对齐需第21步及之后,所有步的loss与nvidia对应步的loss平均相对误差小于2%。NVloss曲线请联系智源研究院获取 + +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------------ | --------- | ----------- | ------ | --------- | ---------- | ----- | ----------- | +| MLU单机8卡(1x8) | / | / | / | / | / | / | / | diff --git a/training/cambricon/llama3_8B-megatron/config/config_MLUx1x8.py b/training/cambricon/llama3_8B-megatron/config/config_MLUx1x8.py new file mode 100644 index 000000000..433eb58df --- /dev/null +++ b/training/cambricon/llama3_8B-megatron/config/config_MLUx1x8.py @@ -0,0 +1,14 @@ +tokenizer_path = "llama3_8b_hf" +localbs = 1 +train_steps = 300 +# 请在Flagperf目录下,创建MLU_FP16_FLOPS.py, +# 文件包含MLU硬件算力值,示例如下: +# MLU_FP16_FLOPS=1.0 +FLOPS_DIR='../../../../' +import sys +sys.path.append(FLOPS_DIR) +from MLU_FP16_FLOPS import MLU_FP16_FLOPS +theoryflops = float(MLU_FP16_FLOPS) +megatron_path = "/workspace/Megatron-LM" # need to be aligned with DockerFile. 
In NGCtorch, it's /workspace/ + Megatron-LM +tensor_parallel = 1 +pipeline_parallel = 2 diff --git a/training/cambricon/llama3_8B-megatron/config/requirements.txt b/training/cambricon/llama3_8B-megatron/config/requirements.txt new file mode 100644 index 000000000..112148b3c --- /dev/null +++ b/training/cambricon/llama3_8B-megatron/config/requirements.txt @@ -0,0 +1,4 @@ +regex==2024.5.15 +schedule==1.2.2 +accelerate==0.31.0 +transformers==4.40.1 \ No newline at end of file diff --git a/training/cambricon/llama3_8B-megatron/config/training_adapter.sh b/training/cambricon/llama3_8B-megatron/config/training_adapter.sh new file mode 100644 index 000000000..e69de29bb diff --git a/training/cambricon/llava1.5_13b-deepspeed-torch/README.md b/training/cambricon/llava1.5_13b-deepspeed-torch/README.md new file mode 100644 index 000000000..a1671497b --- /dev/null +++ b/training/cambricon/llava1.5_13b-deepspeed-torch/README.md @@ -0,0 +1,58 @@ +### Cambricon MLU配置与运行信息参考 +#### 环境配置 +- ##### /硬件环境 + - 机器型号: / + - 加速卡型号: / + - CPU型号: / + - 多机网络类型、带宽: / + +- ##### /软件环境 + - OS版本:Ubuntu 22.04 + - OS kernel版本: 5.15.0-107-generic + - 加速卡驱动版本:v5.10.34 + - Docker 版本:25.0.3 + - 训练框架版本:deepspeed 0.13.2 + +- ##### 并行策略 + + - 并行技术:sharded data parallel + - 实施者:deepspeed ZeRO-DP + - 实施细节:ZeRO-DP Pretrain-stage:O2 Finetune-stage:O3 + +- ##### 优化策略 + + - flash attention 2 + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_MLUx1x8.py中所写,在本case中pretrain阶段为32,finetune阶段为16 + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_MLUx1x8.py中所写,在本case中默认为2048。这里需注意,llava1.5实际训练时,实际序列长度并非都为2048,本case在计算MFU时,统计每条数据进入模型的实际序列长度求取平均值作为实际序列长度 + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为config_MLUx1x8中所写,在本case中默认为1 + 4. 
global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size + +- ##### 优化策略 + + - 优化方案:flash attention 2 + + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 多模态大模型 | | +| 模型 | llava1.5_13b | | +| 数据集 | LAION-CC-SBU、llava的混合指令微调数据 | | +| 数据精度 |bf16 | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参 | +| 硬件设备简称 | cambricon / | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | +| MMMU(val)结果 | acc(推理/验证) | MMMU(val)回答准确率 | +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem |acc(MMMU) |MFU | +| ------------------- | ---------------- | ------ | ------- | --------- | --------- |--------- | +| /单机8卡(1x8)(pretrain) | / | / | / | / | / | / | +| /单机8卡(1x8)(finetune) | / | / | / | / | / | / | diff --git a/training/cambricon/llava1.5_13b-deepspeed-torch/config/config_MLUx1x8.py b/training/cambricon/llava1.5_13b-deepspeed-torch/config/config_MLUx1x8.py new file mode 100644 index 000000000..4b64b5e61 --- /dev/null +++ b/training/cambricon/llava1.5_13b-deepspeed-torch/config/config_MLUx1x8.py @@ -0,0 +1,23 @@ +# Common arguments + +# 请在Flagperf目录下,创建MLU_FP16_FLOPS.py, +# 文件包含MLU硬件算力值,示例如下: +# MLU_FP16_FLOPS=1.0 +FLOPS_DIR='../../../../' +import sys +sys.path.append(FLOPS_DIR) +from MLU_FP16_FLOPS import MLU_FP16_FLOPS +theoryflops = float(MLU_FP16_FLOPS) + +# pretrain arguments +pretrain_per_device_train_batch_size = 32 +pretrain_gradient_accumulation_steps = 1 + + +# finetune arguments +finetune_per_device_train_batch_size = 16 +finetune_gradient_accumulation_steps = 1 +output_dir_finetune = "Output/checkpoints_finetune/llava-v1.5-13b" + +# eval arguments +mmmu_data_path = "MMMU/MMMU" \ No newline at end of file diff --git a/training/cambricon/llava1.5_13b-deepspeed-torch/config/ds_config.json 
b/training/cambricon/llava1.5_13b-deepspeed-torch/config/ds_config.json new file mode 100644 index 000000000..8b63ca6e3 --- /dev/null +++ b/training/cambricon/llava1.5_13b-deepspeed-torch/config/ds_config.json @@ -0,0 +1,27 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "logging": { + "log_level": "INFO" + }, + + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 2, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto" + } +} \ No newline at end of file diff --git a/training/cambricon/llava1.5_13b-deepspeed-torch/config/net.sh b/training/cambricon/llava1.5_13b-deepspeed-torch/config/net.sh new file mode 100644 index 000000000..e69de29bb diff --git a/training/cambricon/llava1.5_13b-deepspeed-torch/config/requirements.txt b/training/cambricon/llava1.5_13b-deepspeed-torch/config/requirements.txt new file mode 100644 index 000000000..7a19cd6fc --- /dev/null +++ b/training/cambricon/llava1.5_13b-deepspeed-torch/config/requirements.txt @@ -0,0 +1,2 @@ +sentencePiece==0.1.99 +datasets==2.15.0 \ No newline at end of file diff --git a/training/cambricon/standalone_monitor.py b/training/cambricon/standalone_monitor.py index f49072af8..d5b1615ac 100644 --- a/training/cambricon/standalone_monitor.py +++ b/training/cambricon/standalone_monitor.py @@ -96,7 +96,7 @@ def main(): pwr_thread = threading.Thread(target=run_pwr_cmd, args=(pwr_cmd,5,log_dir + "pwr.log.txt")) threads.append(pwr_thread) - mlu_cmd = "date; paste <(cnmon |grep 'Default') <(cnmon |grep 'MLU590-M9') | awk '{print $3,$4,$5,$9,$10,$11,$25}'; echo \"\"" + mlu_cmd = "date; paste <(cnmon |grep 'Default') <(cnmon |grep 'MLU' | head -n -1) | awk '{print $3,$4,$5,$9,$10,$11,$25}'; echo \"\"" mlu_file = 
open(log_dir + "mlu.log.txt", "w") mlu_thread = threading.Thread(target=run_cmd, args=(mlu_cmd, 5, mlu_file)) threads.append(mlu_thread) diff --git a/training/iluvatar/docker_image/megatron-deepspeed/Dockerfile b/training/iluvatar/docker_image/megatron-deepspeed/Dockerfile new file mode 100644 index 000000000..74c01ef32 --- /dev/null +++ b/training/iluvatar/docker_image/megatron-deepspeed/Dockerfile @@ -0,0 +1,70 @@ +FROM ubuntu:20.04 + +# copy /etc/apt/sources.list . or choose an available one if encountering a problem with the mirror source +# ADD sources.list /etc/apt/ + +RUN /bin/bash -c "source /root/.bashrc" + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH /root/miniconda/bin:$PATH + +RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list +RUN apt-get update -y +RUN apt-get install -y --fix-missing \ + apt-utils \ + sudo \ + openssh-server \ + vim \ + git \ + curl \ + wget \ + tree \ + perl \ + kmod \ + make \ + pciutils \ + build-essential \ + python3.8-dev \ + python3-pip \ + libjpeg-dev \ + zlib1g-dev \ + unzip \ + cmake \ + bzip2 \ + cabextract \ + iputils-ping \ + pbzip2 \ + pv \ + numactl \ + ninja-build \ + gcc-7 \ + g++-7 \ + libncursesw5 + + +# Configure anaconda +RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \ + bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ + /root/miniconda/bin/conda clean -tipsy && \ + ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + conda config --set always_yes yes --set changeps1 no && \ + echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \ + echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc + + +RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`" + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7 + +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +# iluvatar megatron-deepspeed +COPY megatron-deepspeed /workspace/megatron-deepspeed +COPY .triton /root/.triton + +ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}" +ENV PATH="/usr/local/corex/bin:${PATH}" +ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024" + diff --git a/training/iluvatar/docker_image/megatron-deepspeed/megatron-deepspeed_install.sh b/training/iluvatar/docker_image/megatron-deepspeed/megatron-deepspeed_install.sh new file mode 100755 index 000000000..c28c003e8 --- /dev/null +++ b/training/iluvatar/docker_image/megatron-deepspeed/megatron-deepspeed_install.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +SDK_DIR="/workspace/docker_image/sdk_installers" +PKG_DIR="/workspace/docker_image/packages" + + +search_cuda_results=`find ${SDK_DIR} -name "*cuda*10.2*.run"` +for installer in $search_cuda_results; do + echo "Install ${installer}" + sh "${installer}" -- --silent --toolkit +done + +search_sdk_results=`find ${SDK_DIR} -name "corex*.run"` +for installer in $search_sdk_results; do + echo "Install ${installer}" + sh "${installer}" -- --silent --toolkit +done + +torch_packages_results=`find ${PKG_DIR} -name "torch-*.whl"` +if [ -n "$torch_packages_results" ]; then + pip3 install "$torch_packages_results" +fi + +conda install -c conda-forge mpi4py openmpi + +search_packages_results=`find ${PKG_DIR} -name "*.whl"` +for pkg in 
$search_packages_results; do + echo "Install ${pkg}" + pip3 install "${pkg}" +done + + + diff --git a/training/iluvatar/docker_image/megatron-deepspeed/packages/README.md b/training/iluvatar/docker_image/megatron-deepspeed/packages/README.md new file mode 100755 index 000000000..333794a9b --- /dev/null +++ b/training/iluvatar/docker_image/megatron-deepspeed/packages/README.md @@ -0,0 +1,31 @@ +# 以下软件包需联系天数智芯获取 + +>联系邮箱: contact-us@iluvatar.com +BI-100: + +flash_attn-2.5.8+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +ixte-0.2.0+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +megatron_deepspeed-2.4.1+corex.3.4.0.20240623.134-py3-none-any.whl + +torch-2.1.1+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +transformer_engine-1.4.0.dev0+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +triton-2.1.0+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + + +BI-150: + +flash_attn-2.5.8+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +ixte-0.2.0+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +megatron_deepspeed-2.4.1+corex.3.4.0.20240623.134-py3-none-any.whl + +torch-2.1.1+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +transformer_engine-1.4.0.dev0+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl + +triton-2.1.0+corex.3.4.0.20240623.134-cp38-cp38-linux_x86_64.whl diff --git a/training/iluvatar/docker_image/megatron-deepspeed/sdk_installers/README.md b/training/iluvatar/docker_image/megatron-deepspeed/sdk_installers/README.md new file mode 100755 index 000000000..257474664 --- /dev/null +++ b/training/iluvatar/docker_image/megatron-deepspeed/sdk_installers/README.md @@ -0,0 +1,16 @@ +# 以下软件包需联系天数智芯获取 + +>联系邮箱: contact-us@iluvatar.com + +BI-100: + +corex-installer-linux64-3.2.0_x86_64_10.2.run + +cuda_10.2.89_440.33.01_linux.run + +BI-150: + +corex-installer-linux64-3.4.0_x86_64_10.2.run + +cuda_10.2.89_440.33.01_linux.run + diff --git a/training/iluvatar/docker_image/megatron/Dockerfile 
b/training/iluvatar/docker_image/megatron/Dockerfile new file mode 100755 index 000000000..a58fe6269 --- /dev/null +++ b/training/iluvatar/docker_image/megatron/Dockerfile @@ -0,0 +1,63 @@ +FROM ubuntu:20.04 + +# copy /etc/apt/sources.list . or choose an available one if encountering a problem with the mirror source +# ADD sources.list /etc/apt/ + +RUN /bin/bash -c "source /root/.bashrc" + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH /root/miniconda/bin:$PATH + +RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list +RUN apt-get update -y +RUN apt-get install -y --fix-missing \ + apt-utils \ + sudo \ + openssh-server \ + vim \ + git \ + curl \ + wget \ + tree \ + perl \ + kmod \ + make \ + pciutils \ + build-essential \ + python3.10-dev \ + python3-pip \ + libjpeg-dev \ + zlib1g-dev \ + unzip \ + cmake \ + bzip2 \ + cabextract \ + iputils-ping \ + pbzip2 \ + pv \ + numactl \ + ninja-build \ + gcc-7 \ + g++-7 \ + libncursesw5 + +# Configure anaconda +RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py310_4.10.3-Linux-x86_64.sh && \ + bash ./Miniconda3-py310_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ + /root/miniconda/bin/conda clean -tipsy && \ + ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + conda config --set always_yes yes --set changeps1 no && \ + echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \ + echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc + +RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`" + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 10 --slave /usr/bin/g++ g++ /usr/bin/g++-7 + +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}" +ENV PATH="/usr/local/corex/bin:${PATH}" +ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024" diff --git a/training/iluvatar/docker_image/megatron/megatron_install.sh b/training/iluvatar/docker_image/megatron/megatron_install.sh new file mode 100755 index 000000000..89f0a0284 --- /dev/null +++ b/training/iluvatar/docker_image/megatron/megatron_install.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +SDK_DIR="/workspace/docker_image/sdk_installers" +PKG_DIR="/workspace/docker_image/packages" + +search_sdk_results=`find ${SDK_DIR} -name "corex*.run"` +for installer in $search_sdk_results; do + echo "Install ${installer}" + sh "${installer}" -- --silent --toolkit +done + + + diff --git a/training/iluvatar/docker_image/megatron/sdk_installers/README.md b/training/iluvatar/docker_image/megatron/sdk_installers/README.md new file mode 100755 index 000000000..6a270efb3 --- /dev/null +++ b/training/iluvatar/docker_image/megatron/sdk_installers/README.md @@ -0,0 +1,6 @@ +# 以下软件包需联系天数智芯获取 + +>联系邮箱: contact-us@iluvatar.com +BI-150: +corex-docker-installer-3.4.0.20240531.113-10.2-ubuntu20.04-py3.10-x86_64.run + diff --git a/training/iluvatar/llama3_8B-megatron-deepspeed/README.md b/training/iluvatar/llama3_8B-megatron-deepspeed/README.md new file mode 100644 index 000000000..b0861ac17 --- /dev/null +++ 
b/training/iluvatar/llama3_8B-megatron-deepspeed/README.md @@ -0,0 +1,69 @@ + +### iluvatar BI-V150 GPU配置与运行信息参考 +#### BI-V150环境配置 +- ##### 硬件环境 + + - 机器型号: H3C R5300 G5 + - 加速卡型号: BI-V150 + - CPU型号: Intel(R) Xeon(R) Gold 6330 CPU@2.00GHz + - 多机网络类型、带宽: InfiniBand,100Gb/s + +- ##### 软件环境 + + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-148-generic + - 加速卡驱动版本:SDK 3.4.0 + - Docker 版本:20.10.25 + - 训练框架版本:megatron_deepspeed-2.4.1 + - 依赖软件版本:sentencepiece==0.2.0, transformers==4.41.2 + +- ##### 并行策略 + + - 并行技术:张量、流水、数据混合并行,具体并行方案见“运行情况”章节 + - 实施者:megatron-deepspeed + - 实施细节:PP8DP2TP1 + +- ##### 优化策略 + + - flash attention 2 + - recompute-activations + - transformer-engine local + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_BI-V150x1x16.py中所写,在本case中默认为1。**厂商适配时可任意更改** + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_BI-V150x1x16.py中所写,在本case中默认为8192,原则上不可更改 + 3. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size。在本case中,data_parallel_size=world_size/TPsize/PPsize。在本case中默认为512,使得globalbatchsize=4M tokens。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------- | --------------------- | ---------------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama3_8b | | +| 数据集 | wudao | wudao数据集来源于智源研究院
bin/idx数据集文件来源于阿里云灵骏团队
使用llama3 tokenizer预处理 | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/bf16 | +| 超参修改 | parallel,见“性能指标” | 格式为PPxDPyTPz,例如PP2DP4TP1 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | BI-V150 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +精度对齐需第21步及之后,所有步的loss与nvidia对应步的loss平均相对误差小于2%。NVloss曲线请联系智源研究院获取 + +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------------ | --------- | ----------- | ------ | --------- | ---------- | ----- | ----------- | +| BI-V150单机8卡(1x8) | bf16 | PP2DP4TP1 | / | / | / | / | / | + + +*BI-V150单机8卡 消融实验* +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------- | --------- | --------- | ------ | --------- | ------ | ----- | ----- | +| BI-V150单机8卡(1x8) | bf16 | PP8DP2TP1 | / | / | / | / | / | +| + + diff --git a/training/iluvatar/llama3_8B-megatron-deepspeed/config/config_BI-V150x1x16.py b/training/iluvatar/llama3_8B-megatron-deepspeed/config/config_BI-V150x1x16.py new file mode 100644 index 000000000..c61e46c53 --- /dev/null +++ b/training/iluvatar/llama3_8B-megatron-deepspeed/config/config_BI-V150x1x16.py @@ -0,0 +1,7 @@ +tokenizer_path = "/data1/user_homes/lisen/codes/FlagPerf/training/benchmarks/llama3_8B/megatron-deepspeed/tokenizer_llama3.model" +localbs = 1 +train_steps = 300 +theoryflops = 192000000000000.0 +megatron_path = "/workspace/megatron-deepspeed" # need to be aligned with DockerFile. 
In iluvatar, it's /workspace/ + Megatron-LM +tensor_parallel = 1 +pipeline_parallel = 8 diff --git a/training/iluvatar/llama3_8B-megatron-deepspeed/config/requirements.txt b/training/iluvatar/llama3_8B-megatron-deepspeed/config/requirements.txt new file mode 100644 index 000000000..cd02df739 --- /dev/null +++ b/training/iluvatar/llama3_8B-megatron-deepspeed/config/requirements.txt @@ -0,0 +1,2 @@ +sentencepiece==0.2.0 +transformers==4.41.2 diff --git a/training/iluvatar/llama3_8B-megatron-deepspeed/config/training_adapter.sh b/training/iluvatar/llama3_8B-megatron-deepspeed/config/training_adapter.sh new file mode 100644 index 000000000..10455e5e6 --- /dev/null +++ b/training/iluvatar/llama3_8B-megatron-deepspeed/config/training_adapter.sh @@ -0,0 +1,33 @@ +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 + +TRAINING_ARGS=" \ + --micro-batch-size $MBS \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --bf16 \ + --transformer-impl local \ + --eval-iters 0 \ + --disable-bias-linear \ + --eval-interval 100 \ + --no-fp8-wgrad \ + --custom-partition 4 4 4 4 4 4 5 3 \ + --recompute-granularity full \ + --recompute-method block \ + --custom-recompute-layers-per-stage 3 2 2 1 0 0 0 0 \ + --no-load-optim \ + --no-load-rng \ + --initial-loss-scale 4096 \ + --min-loss-scale 1.0 \ + --no-query-key-layer-scaling \ + --use-rotary-position-embeddings \ + --no-position-embedding \ + --untie-embeddings-and-output-weights \ + --rotary-position-embeddings-theta 500000 \ + --make-vocab-size-divisible-by 16032 \ + --seed 1234 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --lr-warmup-iters 0 \ + --save-interval 10000 +" \ No newline at end of file diff --git a/training/iluvatar/mixtral_8x7B-megatron/README.md b/training/iluvatar/mixtral_8x7B-megatron/README.md new file mode 100755 index 000000000..98ca7488a --- /dev/null +++ b/training/iluvatar/mixtral_8x7B-megatron/README.md @@ -0,0 +1,63 @@ +### Iluvatar 
GPU配置与运行信息参考 +#### 环境配置 +- ##### BI-V150硬件环境 + - 机器型号: R5300 G5 + - 加速卡型号: Iluvatar Bi-150 + - CPU型号: Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### BI-V150软件环境 + - OS版本:Ubuntu 20.04 LTS + - OS kernel版本: 5.4.0-148-generic + - 加速卡驱动版本:3.4.0 + - Docker 版本:20.10.25 + - 训练框架版本:megatron-lm 0.6.0+corex.3.4.0.20240531.104 + - 依赖软件版本:transformers==4.37.1,wandb>=0.15.7,hydra-core + + +- ##### 并行策略 + + - 并行技术:张量、流水、数据混合并行,具体并行方案见“运行情况”章节 + - 实施者:megatron + +- ##### 优化策略 + + - transformer-engine impl + + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_BI-V150x1x16.py中所写,在本case中默认为1。**厂商适配时可任意更改** + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,在本case中默认为2048,原则上不可更改 +* 具体操作 + 1. 获取run包:corex-docker-installer-3.4.0.20240531.113-10.2-ubuntu20.04-py3.10-x86_64.run,放置位置/FlagPerf/training/iluvatar/docker_image/megatron/sdk_installers ###由于是daily-sdk不能直接制作需要手动完成,需要先bash *.run得到一个docker image【flagperf-iluvatar-megatron:t_v0.1】 + 安装方式:1、accept;2、点击"install driver";3、点击"set image name",改为flagperf-iluvatar-megatron:t_v0.1;4、点击"install" + 2. 获取mixtral_8x7B的运行代码,放置位置/FlagPerf/data_dir,>联系邮箱: contact-us@iluvatar.com ###也可以放置在其他位置需要修改/FlagPerf/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x1x16.py中mixtral_iluvatar_path的位置;根据自己机器修改同级training_adapter.sh中MASTERADDR的ip。 + 3. 由于算法引用的层级不一致,需要修改/FlagPerf/training/benchmarks/mixtral_8x7B/megatron/run_pretraining.py第63行origin_file = os.path.join(megapath, "megatron/megatron/training/arguments.py")和origin_file = os.path.join(megapath, "megatron/megatron/training/tokenizer/tokenizer.py");修改megatron_main.sh中run_cmd="torchrun $DISTRIBUTED_ARGS $MEGAPATH/megatron/pretrain_gpt.py + 4. /FlagPerf/training/run_benchmarks/config/test_conf.py,下载pip库的源需要用清华源"https://pypi.tuna.tsinghua.edu.cn/simple/";再执行python3 ./run_benchmarks/run.py + 5. 
单机测试中/FlagPerf/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x1x16.py:tensor_parallel=2,pipeline_parallel=2;/FlagPerf/training/iluvatar/mixtral_8x7B-megatron/config/training_adapter.sh:num-layers=8.四机测试中/FlagPerf/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x1x16.py:tensor_parallel=4,pipeline_parallel=2;/FlagPerf/training/iluvatar/mixtral_8x7B-megatron/config/training_adapter.sh:num-layers=32. + 注意:若出现卡断现象,先停掉所有进程执行"ixsmi -r" + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------- | --------------------- | ---------------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | mixtral_8*7B | | +| 数据集 | wudao | wudao数据集来源于智源研究院
bin/idx数据集文件来源于阿里云灵骏团队
使用llama tokenizer预处理 | +| 数据精度 | precision,见“性能指标” | 可选bf16 | +| 超参修改 | parallel,见“性能指标” | 格式为PPxDPyTPz,例如PP2DP4TP1 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参, global batchsize=1200 | +| 硬件设备简称 | BI-V150 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +精度对齐需第21步及之后,所有步的loss与nvidia对应步的loss平均相对误差小于2%。NVloss曲线请联系智源研究院获取 +#目前仅支持单机 +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| -------------- | --------- | --------- | ------ | --------- | ---------- | ----- | ----------- | +| BI150单机8卡(1x8) | bf16 | PP2DP4EP4TP2 | / | 20106.0 | *(仅供性能参考)* | 41/64 | 7.98% | diff --git a/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x1x16.py b/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x1x16.py new file mode 100755 index 000000000..8b0a16d06 --- /dev/null +++ b/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x1x16.py @@ -0,0 +1,8 @@ +mixtral_iluvatar_path = "/home/gengyang/FlagPerf/data_dir" +tokenizer_path = mixtral_iluvatar_path + "/flagscale-iluvatar-mixtral/data_dir/Qwen1___5-7B-Chat-GPTQ-Int8" +localbs = 1 #micro-batch-size +train_steps = 100 ##训练迭代次数 +theoryflops = 192000000000000.0 +megatron_path = mixtral_iluvatar_path + "/flagscale-iluvatar-mixtral"#"/workspace/Megatron-LM" # need to be aligned with DockerFile. 
In NGCtorch, it's /workspace/ + Megatron-LM +tensor_parallel = 2 #四机为4,非四机暂设为2 +pipeline_parallel = 2 diff --git a/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x4x16.py b/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x4x16.py new file mode 100755 index 000000000..ee70f58dd --- /dev/null +++ b/training/iluvatar/mixtral_8x7B-megatron/config/config_BI-V150x4x16.py @@ -0,0 +1,8 @@ +mixtral_iluvatar_path = "/data1/user_homes/gengyang/code/FlagPerf/data_dir" +tokenizer_path = mixtral_iluvatar_path + "/flagscale-iluvatar-mixtral/data_dir/Qwen1___5-7B-Chat-GPTQ-Int8" +localbs = 1 #micro-batch-size +train_steps = 100 ##训练迭代次数 +theoryflops = 192000000000000.0 +megatron_path = mixtral_iluvatar_path + "/flagscale-iluvatar-mixtral"#"/workspace/Megatron-LM" # need to be aligned with DockerFile. In NGCtorch, it's /workspace/ + Megatron-LM +tensor_parallel = 4 #四机为4,非四机暂设为2 +pipeline_parallel = 2 diff --git a/training/iluvatar/mixtral_8x7B-megatron/config/requirements.txt b/training/iluvatar/mixtral_8x7B-megatron/config/requirements.txt new file mode 100755 index 000000000..98965d706 --- /dev/null +++ b/training/iluvatar/mixtral_8x7B-megatron/config/requirements.txt @@ -0,0 +1,3 @@ +#transformers==4.40.1 +wandb>=0.15.7 +hydra-core \ No newline at end of file diff --git a/training/iluvatar/mixtral_8x7B-megatron/config/training_adapter.sh b/training/iluvatar/mixtral_8x7B-megatron/config/training_adapter.sh new file mode 100755 index 000000000..f5bbfbdbc --- /dev/null +++ b/training/iluvatar/mixtral_8x7B-megatron/config/training_adapter.sh @@ -0,0 +1,102 @@ +echo "[Prompt] iluvatar adaption is not NULL, for other Vendors" +GPUS_PER_NODE=$2 +NNODES=$3 +NODE_RANK=$4 +MEGAPATH=$7 +MBS=$8 +ITERS=$9 +TP=${10} +PP=${11} +MASTERADDR=10.31.10.149 +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +export PYTHONPATH=$MEGAPATH/megatron:$MEGAPATH/megatron:$PYTHONPATH +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +##非四机暂设为8,四机16卡num-layers=32 
+MODEL_ARGS=" \ + --num-layers 8 \ + --hidden-size 4096 \ + --ffn-hidden-size 14336 \ + --num-attention-heads 32 \ + --max-position-embeddings 32768 \ + --seq-length 2048 \ + --swiglu \ + --normalization RMSNorm \ + --global-batch-size 128 \ + --disable-bias-linear \ + --no-position-embedding \ + --no-masked-softmax-fusion \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --rotary-base 1000000.0" + +OPT_ARGS=" \ + --lr 1.0e-5 \ + --min-lr 1e-05 \ + --train-iters $ITERS \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --init-method-std 0.02" + +MOE_ARGS=" \ + --moe-router-topk 2 \ + --num-experts 8 \ + --moe-aux-loss-coeff 1e-2 \ + --expert-model-parallel-size 4 \ + --moe-router-load-balancing-type aux_loss \ + " +TRAINING_ARGS=" \ + --micro-batch-size $MBS \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --sequence-parallel \ + --bf16 +" +DATA_ARGS=" \ + --data-path $MEGAPATH/data_dir/pile_wikipedia_demo \ + --tokenizer-type QwenTokenizer \ + --split 99,1,0 +" +# vendor args +VENDOR_ARGS=" \ + --transformer-impl transformer_engine \ + --use-distributed-optimizer \ + --use-mcore-models +" +DISTRIBUTED_ARGS=" + --rdzv_id default \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master-addr $MASTERADDR \ + --master-port 53497 \ + --redirects 3 \ + --tee 3 +" +setup_args=" + --tensorboard-log-interval 1 \ + --wandb-project mixtral \ + --wandb-exp-name mixtral-8x7b \ + --save-interval 10000 \ + --save $MEGAPATH/outputs/checkpoints \ + --norm-epsilon 1e-05 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --lr-warmup-iters 500 \ + --tokenizer-path $MEGAPATH/data_dir/Qwen1___5-7B-Chat-GPTQ-Int8/ \ + --vocab-file $MEGAPATH/examples/aquila/tokenizer/vocab.json \ + --merge-file $MEGAPATH/examples/aquila/tokenizer/merges.txt \ + --special-tokens-file 
$MEGAPATH/examples/aquila/tokenizer/special_tokens.txt \ + --vocab-size 151851 \ + --make-vocab-size-divisible-by 64 \ + --wandb-save-dir $MEGAPATH/outputs/wandb \ + --tensorboard-dir $MEGAPATH/outputs/tensorboard \ + --load $MEGAPATH/outputs/checkpoints +" + +ALGO_ARGS="$MODEL_ARGS $OPT_ARGS $GQA_ARGS $MOE_ARGS $setup_args" +##--log_dir $MEGAPATH/outputs/logs/details/host/20240627_060549.688979 diff --git a/training/kunlunxin/aquila2_34B_container-in_container/Dockerfile b/training/kunlunxin/aquila2_34B_container-in_container/Dockerfile index dd906bf1d..fa71f1513 100644 --- a/training/kunlunxin/aquila2_34B_container-in_container/Dockerfile +++ b/training/kunlunxin/aquila2_34B_container-in_container/Dockerfile @@ -1,4 +1,45 @@ -FROM 10.8.211.50:5000/zhiyuan_flagperf_ai_platform:v1.0 +FROM 10.1.15.74:5000/zhiyuan_flagperf_ai_platform:v2.0 -RUN /bin/bash -c "git clone https://github.com/FlagOpen/FlagScale -b v0.2" -RUN /bin/bash -c "git clone https://github.com/FlagOpen/FlagPerf.git -b AI_platform" +ENV CUDART_DUMMY_REGISTER=1 +ENV XPU_DUMMY_EVENT=1 + +ENV XBLAS_FC_HBM_VERSION=40 +ENV XPU_FORCE_CODE_PARAM_LOCATE_IN_L3=1 +ENV XPU_FORCE_USERMODE_LAUNCH=1 + +ENV LD_LIBRARY_PATH=/usr/local/xpu/xre/so:/usr/local/xpu/xccl/so:$LD_LIBRARY_PATH +ENV XMLIR_FA_GEMM_TYPE=float16 +ENV FAST_SWIGLU_ENABLE=1 +ENV XDNN_FAST_DIV_SCALAR=true +ENV FAST_SUB_MN_M=true + +ENV NCCL_SOCKET_IFNAME=ens21f0np0 +ENV NCCL_IB_HCA=mlx5 +ENV NCCL_IB_GID_INDEX=3 + +ENV BKCL_RDMA_PROXY_DISABLE=1 +ENV BKCL_FLAT_RING=1 +ENV BKCL_CCIX_RING=1 +ENV BKCL_TREE_THRESHOLD=1 +ENV BKCL_CCIX_BUFFER_GM=1 +ENV BKCL_FORCE_L3_RDMA=1 +ENV BKCL_RING_BUFFER_GM=1 +ENV BKCL_ENABLE_XDR=1 +ENV BKCL_RDMA_FORCE_TREE=1 +ENV BKCL_TREE_THRESHOLD=1 +ENV XPU_ZEBU_MODE=1 +ENV BKCL_RDMA_NICS=ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0 +ENV BKCL_XLINK_D2D=0 +ENV BKCL_XLINK_C2C=1 +ENV BKCL_XLINK_ETH=0 +ENV BKCL_TRANS_UNSUPPORTED_DATATYPE=1 +ENV BKCL_KL3_TURBO_MODE=1 +ENV BKCL_RING_BUFFER_SIZE=2097152 +ENV 
ALLREDUCE_ASYNC=false +ENV ALLGATHER_ASYNC=false +ENV ALLREDUCE_FUSION=0 +ENV BKCL_TIMEOUT=360000 +RUN unset BKCL_KL3_SYSCON_FLAG + +RUN /bin/bash -c "git clone https://github.com/FlagOpen/FlagScale -b release/v0.2" +RUN /bin/bash -c "git clone https://github.com/FlagOpen/FlagPerf.git -b main" diff --git a/training/kunlunxin/aquila2_34B_container-in_container/singlenode_adapt.sh b/training/kunlunxin/aquila2_34B_container-in_container/singlenode_adapt.sh index 0c94f9ca5..9a246da98 100644 --- a/training/kunlunxin/aquila2_34B_container-in_container/singlenode_adapt.sh +++ b/training/kunlunxin/aquila2_34B_container-in_container/singlenode_adapt.sh @@ -14,8 +14,7 @@ TRAINING_ARGS=" " MIXED_PRECISION_ARGS=" - --embedding-weights-in-fp32 \ - --rotary-position-embeddings-in-fp32 \ - --attention-softmax-in-fp32 \ - --accumulate-allreduce-grads-in-fp32 + --fp16 \ + --initial-loss-scale 65536 \ + --min-loss-scale 1.0 \ " diff --git a/training/kunlunxin/baichuan2_13b-deepspeed/README.md b/training/kunlunxin/baichuan2_13b-deepspeed/README.md index 632c517d4..c35e6c0d8 100644 --- a/training/kunlunxin/baichuan2_13b-deepspeed/README.md +++ b/training/kunlunxin/baichuan2_13b-deepspeed/README.md @@ -9,7 +9,7 @@ - OS版本:Ubuntu 20.04 - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - - Docker镜像和版本:iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27 + - Docker镜像和版本:iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.29 - 训练框架版本:xmlir - 训练编译器版本:xacc - 依赖软件版本:pytorch-2.0.1 @@ -36,7 +36,7 @@ | 任务类别 | 自然语言理解 | | | 模型 | baichuan2_13b | | | 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | -| 数据精度 | bf16 | | +| 数据精度 | fp16 | | | 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | | 硬件设备简称 | nvidia H800 | | | 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | @@ -48,3 +48,4 @@ | 配置 | fix_hp | token/p/s | loss | mem | MFU | | --------------- | -------------------------- | --------- | ---- | ----- | ----- | | R300 1机1卡(1x1) | GAS=1 | -- | -- | -- | -- | +| R300 2机8卡(2x8) | GAS=64 | -- | -- | -- 
| -- | diff --git a/training/kunlunxin/baichuan2_13b-deepspeed/config/config_R300x2x8.py b/training/kunlunxin/baichuan2_13b-deepspeed/config/config_R300x2x8.py new file mode 100644 index 000000000..d0b35482c --- /dev/null +++ b/training/kunlunxin/baichuan2_13b-deepspeed/config/config_R300x2x8.py @@ -0,0 +1,5 @@ +seqlength = 2048 +batchsize = 1 +datafilename = "openwebtext_baichuan2_100M.npy" +theoryflops = 1.0 +epochs = 1 diff --git a/training/kunlunxin/baichuan2_13b-deepspeed/config/ds_config.json b/training/kunlunxin/baichuan2_13b-deepspeed/config/ds_config.json index f2567f72a..bc82dd6c1 100644 --- a/training/kunlunxin/baichuan2_13b-deepspeed/config/ds_config.json +++ b/training/kunlunxin/baichuan2_13b-deepspeed/config/ds_config.json @@ -1,5 +1,8 @@ { - "gradient_accumulation_steps": 1, + "gradient_accumulation_steps": 64, + "bf16": { + "enabled": false + }, "fp16": { "enabled": true } diff --git a/training/kunlunxin/baichuan2_13b-deepspeed/config/environment_variables.sh b/training/kunlunxin/baichuan2_13b-deepspeed/config/environment_variables.sh index 8dcbc2157..4ebddd816 100644 --- a/training/kunlunxin/baichuan2_13b-deepspeed/config/environment_variables.sh +++ b/training/kunlunxin/baichuan2_13b-deepspeed/config/environment_variables.sh @@ -1,13 +1,3 @@ -export LD_LIBRARY_PATH=/workspace/bkcl_so/so:/workspace/xre-Linux-x86_64-0.0.0.1-2024-03-28-23-30-24-daily/so:$LD_LIBRARY_PATH - -export CUDART_DUMMY_REGISTER=1 -export XPU_DUMMY_EVENT=1 - -# ulimit -c 0 -export NCCL_SOCKET_IFNAME=xgbe0 -export NCCL_IB_HCA=mlx5 -export NCCL_IB_GID_INDEX=3 - ################################# # driver方式部分 ################################# @@ -15,63 +5,34 @@ export CUDART_DUMMY_REGISTER=1 export XPU_FORCE_USERMODE_LAUNCH=1 export XPU_DUMMY_EVENT=1 -################################# -# 算子部分 -################################# -export XPUAPI_DEFAULT_SIZE=4000000000 -export XBLAS_FC_HBM_VERSION=40 - -################################# -# 算子检查部分 -################################# 
-export XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL=0 -export XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL=False -export XMLIR_DUMP_FALLBACK_OP_LIST_BOOL=true - -################################# -# hbm部分 -################################# -export XPU_FORCE_CODE_PARAM_LOCATE_IN_L3=1 - ################################# # BKCL C2C部分 ################################# +export BKCL_CCIX_BUFFER_GM=1 export BKCL_CCIX_RING=1 +export BKCL_ENABLE_XDR=1 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_FORCE_SYNC=1 +export BKCL_KL3_TURBO_MODE=1 +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_RDMA_NICS=ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0 +export BKCL_RING_BUFFER_GM=1 +export BKCL_RING_BUFFER_SIZE=2097152 +export BKCL_SOCKET_IFNAME=ens21f0np0 +export BKCL_TIMEOUT=360000 +export BKCL_TRANS_UNSUPPORTED_DATATYPE=1 export BKCL_TREE_THRESHOLD=1 -export BKCL_CCIX_BUFFER_GM=1 - -# ccix_inner_8chips -cat > ccix_inter.txt < [chip 6, port 3] -[chip 4, port 1] <===> [chip 5, port 2] -[chip 2, port 3] <===> [chip 4, port 0] -[chip 2, port 1] <===> [chip 3, port 2] -[chip 1, port 1] <===> [chip 3, port 1] -[chip 0, port 1] <===> [chip 1, port 2] -[chip 0, port 2] <===> [chip 2, port 2] -[chip 0, port 3] <===> [chip 3, port 3] -[chip 1, port 0] <===> [chip 2, port 0] -[chip 1, port 3] <===> [chip 7, port 0] -[chip 5, port 1] <===> [chip 7, port 1] -[chip 3, port 0] <===> [chip 5, port 3] -[chip 4, port 3] <===> [chip 7, port 3] -[chip 4, port 2] <===> [chip 6, port 2] -[chip 6, port 1] <===> [chip 7, port 2] -[chip 5, port 0] <===> [chip 6, port 0] -EOF - -export XPU_ZEBU_MODE=1 -export BKCL_XLINK_D2D=0 export BKCL_XLINK_C2C=1 +export BKCL_XLINK_D2D=0 export BKCL_XLINK_ETH=0 -export BKCL_TRANS_UNSUPPORTED_DATATYPE=1 -export BKCL_RING_BUFFER_GM=1 -export BKCL_FORCE_SYNC=1 -export BKCL_KL3_TURBO_MODE=1 -export BKCL_RING_BUFFER_SIZE=2097152 -export XPUSIM_TOPOLOGY_FILE="ccix_inter.txt" export ALLREDUCE_ASYNC=false export ALLGATHER_ASYNC=false export ALLREDUCE_FUSION=0 -export 
BKCL_TIMEOUT=3600 -unset BKCL_KL3_SYSCON_FLAG + +# 性能 +export XDNN_USE_FAST_SWISH=true +export XDNN_FAST_DIV_SCALAR=true +export XMLIR_BMM_DISPATCH_VALUE=2 +export XBLAS_FC_HBM_VERSION=40 +export XPU_FORCE_CODE_PARAM_LOCATE_IN_L3=1 +export XPUAPI_DEFAULT_SIZE=4000000000 diff --git a/training/kunlunxin/chatglm3_6b-deepspeed/README.md b/training/kunlunxin/chatglm3_6b-deepspeed/README.md new file mode 100644 index 000000000..cc24a3558 --- /dev/null +++ b/training/kunlunxin/chatglm3_6b-deepspeed/README.md @@ -0,0 +1,51 @@ +### 昆仑芯XPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: 昆仑芯AI加速器组R480-X8 + - 加速卡型号: 昆仑芯AI加速卡R300+ + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.15.0-97-generic + - 加速卡驱动版本:5.0.11 + - Docker镜像和版本:iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.29 + - 训练框架版本:xmlir + - 训练编译器版本:xacc + - 依赖软件版本:pytorch-2.0.1 + - 训练框架版本:deepspeed 0.14.4 + +- ##### 并行策略 + + - 并行技术:sharded data parallel + - 实施者:deepspeed ZeRO-DP + - 实施细节:ZeRO-DP O3, DP_SIZE=8 + + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_R300x1x8.py中所写,在本case中默认为**1** + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_R300x1x8.py中所写,在本case中默认为**4096** + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为1,精度对齐实验默认为**1** + 4. 
global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | chatglm3_6b | | +| 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | +| 数据精度 | amp | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | +| 硬件设备简称 | R300+ | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +| 配置 | precision | fix_hp | token/p/s | loss | mem | MFU | +| --------------- | ----------- |-------------------------- | --------- | ---- | ----- | ----- | +| R300+ 单机8卡(1x8) | bf16 | LBS=8, gradient_checkpointing_enable = True | -- | -- | -- | -- | diff --git a/training/kunlunxin/chatglm3_6b-deepspeed/config/config_R300x1x8.py b/training/kunlunxin/chatglm3_6b-deepspeed/config/config_R300x1x8.py new file mode 100644 index 000000000..0426208a3 --- /dev/null +++ b/training/kunlunxin/chatglm3_6b-deepspeed/config/config_R300x1x8.py @@ -0,0 +1,7 @@ +seqlength = 4096 +batchsize = 8 +datafilename = "openwebtext_chatglm3_100M.npy" +theoryflops = 128000000000000.0 +epochs = 1 +flashattn = True +gradient_checkpointing_enable = True \ No newline at end of file diff --git a/training/kunlunxin/chatglm3_6b-deepspeed/config/ds_config.json b/training/kunlunxin/chatglm3_6b-deepspeed/config/ds_config.json new file mode 100644 index 000000000..7b9b07412 --- /dev/null +++ b/training/kunlunxin/chatglm3_6b-deepspeed/config/ds_config.json @@ -0,0 +1,3 @@ +{ + "gradient_accumulation_steps": 1 +} \ No newline at end of file diff --git a/training/kunlunxin/chatglm3_6b-deepspeed/config/environment_variables.sh b/training/kunlunxin/chatglm3_6b-deepspeed/config/environment_variables.sh new file mode 100644 index 000000000..d67905fd8 --- /dev/null +++ 
b/training/kunlunxin/chatglm3_6b-deepspeed/config/environment_variables.sh @@ -0,0 +1,10 @@ +export LD_LIBRARY_PATH=/workspace/xre-Linux-x86_64-5.0.11.1/so:/workspace/bkcl/so:$LD_LIBRARY_PATH +export CUDART_DUMMY_REGISTER=1 +export XPU_FORCE_USERMODE_LAUNCH=1 +export XPU_DUMMY_EVENT=1 +export USE_FAST_BF16_FC=True +export XMLIR_FA_GEMM_TYPE=float16 +export XMLIR_API_DEFAULT_SIZE=4000000000 +export XBLAS_FC_HBM_VERSION=40 +export XDNN_USE_FAST_SWISH=true +export XDNN_FAST_DIV_SCALAR=true \ No newline at end of file diff --git a/training/kunlunxin/chatglm3_6b-deepspeed/config/net.sh b/training/kunlunxin/chatglm3_6b-deepspeed/config/net.sh new file mode 100644 index 000000000..08de32bfd --- /dev/null +++ b/training/kunlunxin/chatglm3_6b-deepspeed/config/net.sh @@ -0,0 +1,2 @@ +export CUDA_DEVICE_MAX_CONNECTIONS=1 +ipcs -m | awk '/0x/ {print $2}' | xargs -n 1 ipcrm shm \ No newline at end of file diff --git a/training/kunlunxin/chatglm3_6b-deepspeed/config/requirements.txt b/training/kunlunxin/chatglm3_6b-deepspeed/config/requirements.txt new file mode 100644 index 000000000..c539fed13 --- /dev/null +++ b/training/kunlunxin/chatglm3_6b-deepspeed/config/requirements.txt @@ -0,0 +1,2 @@ +sentencepiece +transformers==4.30.2 \ No newline at end of file diff --git a/training/kunlunxin/docker_image/deepspeed_v0.14.4/Dockerfile b/training/kunlunxin/docker_image/deepspeed_v0.14.4/Dockerfile new file mode 100644 index 000000000..daa079f84 --- /dev/null +++ b/training/kunlunxin/docker_image/deepspeed_v0.14.4/Dockerfile @@ -0,0 +1,10 @@ +FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.29 +RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python + +ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH +ENV LD_LIBRARY_PATH /workspace/xre-Linux-x86_64-5.0.11.1/so:/workspace/bkcl/so:$LD_LIBRARY_PATH +ENV CUDART_DUMMY_REGISTER 1 +ENV XPU_FORCE_USERMODE_LAUNCH 1 
+ENV XPU_DUMMY_EVENT 1 \ No newline at end of file diff --git a/training/kunlunxin/docker_image/deepspeed_v0.14.4/deepspeed_v0.14.4_install.sh b/training/kunlunxin/docker_image/deepspeed_v0.14.4/deepspeed_v0.14.4_install.sh new file mode 100644 index 000000000..4bcb475db --- /dev/null +++ b/training/kunlunxin/docker_image/deepspeed_v0.14.4/deepspeed_v0.14.4_install.sh @@ -0,0 +1,7 @@ +#!/bin/bash +export https_proxy=http://10.1.0.34:7890 +pip install deepspeed==0.14.4 + +wget https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/R300_plus/latest/xpytorch-cp38-torch201-ubuntu2004-x64.run + +bash xpytorch-cp38-torch201-ubuntu2004-x64.run \ No newline at end of file diff --git a/training/metax/bert_hf-pytorch/README.md b/training/metax/bert_hf-pytorch/README.md index 0b23692f7..5f2a3587d 100644 --- a/training/metax/bert_hf-pytorch/README.md +++ b/training/metax/bert_hf-pytorch/README.md @@ -6,9 +6,9 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04.6 - OS kernel版本: 5.4.0-26-generic - - 加速卡驱动版本:2.2.0 + - 加速卡驱动版本:2.3.0 - Docker 版本:24.0.7 - - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl + - 训练框架版本:pytorch-2.0.0+mc2.20.2.20-cp38-cp38-linux_x86_64.whl - 依赖软件版本:无 ### 运行情况 @@ -35,10 +35,10 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----------- | --------- | -| C500单机8卡(1x8) | fp32 | bs=12 | | | | | 0.657/0.655 | 30.4/64.0 | -| C500单机8卡(1x8) | amp | bs=20,lr=5e-5 | | | | | 0.658/0.655 | 34.9/64.0 | -| C500单机8卡(1x8) | bf16 | bs=32,lr=1e-4 | | | | | 0.648/0.655 | 37.3/64.0 | -| C500单机单卡(1x1) | fp32 | bs=12,lr=6.25e-6 | | | | | 0.656/0.655 | 29.4/64.0 | -| C500两机8卡(2x8) | fp32 | bs=12,lr=7e-5 | | | | | 0.657/0.655 | 32.9/64.0 | +| C500单机8卡(1x8) | fp32 | bs=16 | | | | | 0.655/0.655 | 39.9/64.0 | +| C500单机8卡(1x8) | amp | bs=20,lr=5e-5 | | | | | 0.656/0.655 | 38.1/64.0 | +| C500单机8卡(1x8) | bf16 | bs=32,lr=1e-4 | | | | | 0.648/0.655 | 38.7/64.0 | +| 
C500单机单卡(1x1) | fp32 | bs=12,lr=6.25e-6 | | | | | 0.656/0.655 | 34.7/64.0 | +| C500两机8卡(2x8) | fp32 | bs=12,lr=7e-5 | | | | | 0.656/0.655 | 30.6/64.0 | diff --git a/training/metax/bert_hf-pytorch/config/config_C550x1x8_amp.py b/training/metax/bert_hf-pytorch/config/config_C550x1x8_amp.py deleted file mode 100644 index d5b3a6992..000000000 --- a/training/metax/bert_hf-pytorch/config/config_C550x1x8_amp.py +++ /dev/null @@ -1,4 +0,0 @@ -train_batch_size = 20 -eval_batch_size = 20 -lr = 5e-5 -amp = True diff --git a/training/metax/bert_hf-pytorch/config/config_C550x1x8_bf16.py b/training/metax/bert_hf-pytorch/config/config_C550x1x8_bf16.py deleted file mode 100644 index 319475b57..000000000 --- a/training/metax/bert_hf-pytorch/config/config_C550x1x8_bf16.py +++ /dev/null @@ -1,4 +0,0 @@ -train_batch_size = 32 -eval_batch_size = 32 -lr = 1e-4 -fp16 = True \ No newline at end of file diff --git a/training/metax/bert_hf-pytorch/config/config_C550x1x8_fp32.py b/training/metax/bert_hf-pytorch/config/config_C550x1x8_fp32.py deleted file mode 100644 index 3c8ea9d66..000000000 --- a/training/metax/bert_hf-pytorch/config/config_C550x1x8_fp32.py +++ /dev/null @@ -1,2 +0,0 @@ -train_batch_size = 12 -eval_batch_size = 12 \ No newline at end of file diff --git a/training/metax/chatglm3_6b-deepspeed/README.md b/training/metax/chatglm3_6b-deepspeed/README.md new file mode 100644 index 000000000..aefa43d55 --- /dev/null +++ b/training/metax/chatglm3_6b-deepspeed/README.md @@ -0,0 +1,46 @@ +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.19.2.23-cp38-cp38-linux_x86_64.whl, deepspeed 0.10.0 + +- ##### 并行策略 + + - 并行技术:sharded data parallel + - 实施者:deepspeed ZeRO-DP + - 实施细节:ZeRO-DP O3, DP_SIZE=8 + +### 运行情况 + +* 输入批尺寸 + 1. 
local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_C500x1x8.py中所写,在本case中默认为**1** + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_AC500x1x8.py中所写,在本case中默认为**4096** + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为**1** + 4. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | chatglm3_6b | | +| 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | +| 数据精度 | amp | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | +| 硬件设备简称 | Metax C500 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem | MFU | +| ------------------- | ---------------- | ------ | ------- | --------- | --------- | +| C500单机8卡(1x8) | mpe=2048 | / | 6.84 | 54/64 | 35.7% | diff --git a/training/nvidia/chatglm3_6b-deepspeed/config/config_A100x1x8.py b/training/metax/chatglm3_6b-deepspeed/config/config_C500x1x8.py similarity index 73% rename from training/nvidia/chatglm3_6b-deepspeed/config/config_A100x1x8.py rename to training/metax/chatglm3_6b-deepspeed/config/config_C500x1x8.py index 8291fc310..dc34b1700 100644 --- a/training/nvidia/chatglm3_6b-deepspeed/config/config_A100x1x8.py +++ b/training/metax/chatglm3_6b-deepspeed/config/config_C500x1x8.py @@ -1,6 +1,6 @@ seqlength = 4096 batchsize = 1 datafilename = "openwebtext_chatglm3_100M.npy" -theoryflops = 312000000000000.0 +theoryflops = 240000000000000.0 epochs = 1 flashattn = True diff --git a/training/metax/chatglm3_6b-deepspeed/config/ds_config.json b/training/metax/chatglm3_6b-deepspeed/config/ds_config.json new file mode 100644 index 000000000..e2b0f2b02 --- /dev/null +++ 
b/training/metax/chatglm3_6b-deepspeed/config/ds_config.json @@ -0,0 +1,3 @@ +{ + "gradient_accumulation_steps": 64 + } diff --git a/training/metax/chatglm3_6b-deepspeed/config/environment_variables.sh b/training/metax/chatglm3_6b-deepspeed/config/environment_variables.sh new file mode 100644 index 000000000..8583ab628 --- /dev/null +++ b/training/metax/chatglm3_6b-deepspeed/config/environment_variables.sh @@ -0,0 +1,31 @@ +export MACA_PATH=/opt/maca + +export MACA_CLANG_PATH=${MACA_PATH}/mxgpu_llvm/bin +export MACA_CLANG=${MACA_PATH}/mxgpu_llvm +export DEVINFO_ROOT=${MACA_PATH} +export PATH=${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH} +export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} + +export CUCC_PATH=${MACA_PATH}/tools/cu-bridge +export CUDA_PATH=${CUCC_PATH} +export CUDA_HOME=${CUCC_PATH} + +#MACA-PyTorch envs +export ISU_FASTMODEL=1 # must be set, otherwise may induce precision error +export USE_TDUMP=OFF # optional, use to control whether generating debug file +export TMEM_LOG=OFF # optional, use to control whether generating debug file +export DEBUG_ITRACE=0 # optional, use to control whether generating debug file + +# export MACA_SMALL_PAGESIZE_ENABLE=1 +export MALLOC_THRESHOLD=99 +export MCPYTORCH_CHECK_ANOMALY_INF=1 +export FORCE_ACTIVATE_WAIT=1 + +export MCCL_MAX_NCHANNELS=16 +export MCCL_P2P_LEVEL=SYS +export MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 +export MCPYTORCH_DISABLE_PRINT=1 +export MHA_USE_BLAS=ON +export MHA_BWD_NO_ATOMIC_F64=1 + +export SET_DEVICE_NUMA_PREFERRED=1 diff --git a/training/metax/chatglm3_6b-deepspeed/config/net.sh b/training/metax/chatglm3_6b-deepspeed/config/net.sh new file mode 100644 index 000000000..63f52529c --- /dev/null +++ b/training/metax/chatglm3_6b-deepspeed/config/net.sh @@ -0,0 +1 @@ +export CUDA_DEVICE_MAX_CONNECTIONS=1;export NCCL_SOCKET_IFNAME=enp;export NCCL_IB_DISABLE=1; diff --git a/training/metax/chatglm3_6b-deepspeed/config/requirements.txt 
b/training/metax/chatglm3_6b-deepspeed/config/requirements.txt new file mode 100644 index 000000000..511502efb --- /dev/null +++ b/training/metax/chatglm3_6b-deepspeed/config/requirements.txt @@ -0,0 +1,2 @@ +sentencepiece +transformers==4.30.2 diff --git a/training/metax/cpm-pytorch/README.md b/training/metax/cpm-pytorch/README.md index 4199c1f66..5820166a3 100644 --- a/training/metax/cpm-pytorch/README.md +++ b/training/metax/cpm-pytorch/README.md @@ -15,9 +15,9 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04.6 - OS kernel版本: 5.4.0-26-generic - - 加速卡驱动版本:2.2.0 + - 加速卡驱动版本:2.3.0 - Docker 版本:24.0.7 - - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl + - 训练框架版本:pytorch-2.0.0+mc2.23.0.16-cp38-cp38-linux_x86_64.whl - 依赖软件版本:无 #### 运行情况 @@ -44,6 +44,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| C500单机8卡(1x8) | fp16 | / | | | | | 0.916 | 22.6/64.0 | -| C500单机单卡(1x1) | fp16 | / | | | | | | 29.7/64.0 | -| C500两机8卡(2x8) | fp16 | / | | | | | | 30.8/64.0 | \ No newline at end of file +| C500单机8卡(1x8) | fp16 | / | | | | | 0.916 | 23.5/64.0 | +| C500单机单卡(1x1) | fp16 | / | | | | | 0.921 | 47.9/64.0 | +| C500两机8卡(2x8) | fp16 | / | | | | | | 49.4/64.0 | diff --git a/training/metax/cpm-pytorch/config/config_C500x1x1.py b/training/metax/cpm-pytorch/config/config_C500x1x1.py index 4b08673bb..4bdcdb158 100644 --- a/training/metax/cpm-pytorch/config/config_C500x1x1.py +++ b/training/metax/cpm-pytorch/config/config_C500x1x1.py @@ -7,13 +7,13 @@ gradient_accumulation_steps = 1 -train_batch_size = 128 +train_batch_size = 384 eval_batch_size = train_batch_size max_steps = 3000 max_samples_termination = 439126000 warmup = 0.2 -learning_rate = 0.002 +learning_rate = 0.0007 beta_1: float = 0.9 beta_2: float = 0.99 diff --git a/training/metax/cpm-pytorch/config/config_C500x1x8.py b/training/metax/cpm-pytorch/config/config_C500x1x8.py index 
d5f4eda00..314d08a09 100644 --- a/training/metax/cpm-pytorch/config/config_C500x1x8.py +++ b/training/metax/cpm-pytorch/config/config_C500x1x8.py @@ -7,13 +7,13 @@ gradient_accumulation_steps = 1 -train_batch_size = 192 +train_batch_size = 128 eval_batch_size = train_batch_size max_steps = 4000 max_samples_termination = 4391260 warmup = 0.2 -learning_rate = 0.0005 +learning_rate = 0.002 beta_1: float = 0.9 beta_2: float = 0.99 diff --git a/training/metax/cpm-pytorch/config/config_C500x2x8py b/training/metax/cpm-pytorch/config/config_C500x2x8py index 959be5caa..5eb6eb673 100644 --- a/training/metax/cpm-pytorch/config/config_C500x2x8py +++ b/training/metax/cpm-pytorch/config/config_C500x2x8py @@ -8,12 +8,12 @@ target_embedding_average = 0.92 gradient_accumulation_steps = 1 -train_batch_size = 192 +train_batch_size = 384 eval_batch_size = train_batch_size max_steps = 2000 warmup = 0.2 -learning_rate = 0.0005 +learning_rate = 0.001 beta_1: float = 0.9 beta_2: float = 0.99 diff --git a/training/metax/docker_image/deepspeed/Dockerfile b/training/metax/docker_image/deepspeed/Dockerfile index 54b2ce5d5..9a5baaa4c 100644 --- a/training/metax/docker_image/deepspeed/Dockerfile +++ b/training/metax/docker_image/deepspeed/Dockerfile @@ -1,3 +1,3 @@ -FROM maca-c500-pytorch-2.19.2.23-ubuntu20.04-amd64:deepspeed +FROM maca-2.20.2.2:megatron-deepspeed ENV PATH="/opt/conda/bin:${PATH}" RUN /bin/bash -c "uname -a" \ No newline at end of file diff --git a/training/metax/docker_image/megatron-deepspeed/Dockerfile b/training/metax/docker_image/megatron-deepspeed/Dockerfile index 30d95064f..f6661d4ae 100644 --- a/training/metax/docker_image/megatron-deepspeed/Dockerfile +++ b/training/metax/docker_image/megatron-deepspeed/Dockerfile @@ -1,4 +1,4 @@ -FROM megatron_2.18.0.8:FlagPerf_base +FROM maca-2.20.2.2:megatron-deepspeed ENV PATH="/opt/conda/bin:${PATH}" RUN /bin/bash -c "uname -a" RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple" diff --git 
a/training/metax/docker_image/megatron/Dockerfile b/training/metax/docker_image/megatron/Dockerfile index 659aae18d..ca15a83e8 100755 --- a/training/metax/docker_image/megatron/Dockerfile +++ b/training/metax/docker_image/megatron/Dockerfile @@ -1,37 +1,43 @@ -FROM flagperf_megatron:2.18.0.8.1 +FROM metax-megatron:2.23.0.13.342-ubuntu20.04-amd64 ENV PATH="/opt/conda/bin:${PATH}" -RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple" - -ENV MACA_PATH="/opt/maca" -ENV MACA_CLANG_PATH="${MACA_PATH}/mxgpu_llvm/bin" -ENV MACA_CLANG="${MACA_PATH}/mxgpu_llvm" -ENV DEVINFO_ROOT="${MACA_PATH}" -ENV WCUDA_PATH="${MACA_PATH}/tools/wcuda" -ENV CUDA_PATH="${WCUDA_PATH}" -RUN unset CUDA_HOME -ENV PATH="${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}" +RUN /bin/bash -c "pip config set global.index-url http://repo.metax-tech.com/r/pypi/simple" + +ENV MACA_PATH=/opt/maca + +ENV MACA_CLANG_PATH=${MACA_PATH}/mxgpu_llvm/bin +ENV MACA_CLANG=${MACA_PATH}/mxgpu_llvm +ENV DEVINFO_ROOT=${MACA_PATH} +ENV CUDA_PATH=${MACA_PATH}/tools/cu-bridge +ENV CUDA_HOME=${CUDA_PATH} +ENV PATH=${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH} +ENV LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} + +ENV ISU_FASTMODEL=1 +ENV USE_TDUMP=OFF +ENV TMEM_LOG=OFF +ENV DEBUG_ITRACE=0 ENV MALLOC_THRESHOLD=99 -ENV PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:64" +ENV MCPYTORCH_CHECK_ANOMALY_INF=1 +ENV FORCE_ACTIVATE_WAIT=1 + +#kernel selection +ENV MCBLAS_CUSTOMIZED_CONFIG_PATH=/workspace/Megatron-LM-FlagScale/mcblas_customized_config.yaml -ENV MACA_SMALL_PAGESIZE_ENABLE=1 -ENV MALLOC_THRESHOLD=95 -ENV MCPYTORCH_DISABLE_PRINT=1 -ENV MCCL_NET_GDR_LEVEL=7 -#ENV MCCL_MIN_NCHANNELS=16 ENV MCCL_MAX_NCHANNELS=16 + ENV MCCL_P2P_LEVEL=SYS + ENV MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 -ENV FORCE_ACTIVATE_WAIT=1 +ENV MCPYTORCH_DISABLE_PRINT=1 ENV MHA_USE_BLAS=ON -ENV 
LD_LIBRARY_PATH="/root/FWD_76_BWD_79:${LD_LIBRARY_PATH}" -ENV SET_DEVICE_NUMA_PREFERRED=1 +ENV MHA_BWD_NO_ATOMIC_F64=1 -ENV MAX_JOBS=20 -ENV PYTORCH_ENABLE_SAME_RAND_A100=1 +ENV SET_DEVICE_NUMA_PREFERRED=1 +ENV MCCL_IB_GID_INDEX=1 +ENV MACA_SMALL_PAGESIZE_ENABLE=1 RUN /bin/bash -c "uname -a" RUN /bin/bash -c alias python3=python diff --git a/training/metax/docker_image/megatron_core060/Dockerfile b/training/metax/docker_image/megatron_core060/Dockerfile new file mode 100644 index 000000000..c8456d964 --- /dev/null +++ b/training/metax/docker_image/megatron_core060/Dockerfile @@ -0,0 +1,38 @@ +FROM metax-megatron:2.23.0.13.342-ubuntu20.04-amd64 +ENV PATH="/opt/conda/bin:${PATH}" +RUN /bin/bash -c "pip config set global.index-url http://repo.metax-tech.com/r/pypi/simple" + +ENV MACA_PATH="/opt/maca" +ENV MACA_CLANG_PATH="${MACA_PATH}/mxgpu_llvm/bin" +ENV MACA_CLANG="${MACA_PATH}/mxgpu_llvm" +ENV DEVINFO_ROOT="${MACA_PATH}" +ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge" +ENV CUDA_PATH="${CUCC_PATH}" + +ENV PATH="${CUCC_PATH}:${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}" + +# ENV GLOO_SOCKET_IFNAME=ens115f0 + +ENV MACA_SMALL_PAGESIZE_ENABLE=1 +ENV MCPYTORCH_DISABLE_PRINT=1 + +ENV MCCL_NET_GDR_LEVEL=7 +ENV MCCL_P2P_LEVEL=SYS +ENV MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 +ENV FORCE_ACTIVATE_WAIT=1 + +ENV SET_DEVICE_NUMA_PREFERRED=1 + +ENV MAX_JOBS=20 +ENV PYTORCH_ENABLE_SAME_RAND_A100=1 +ENV MHA_BWD_NO_ATOMIC_F64=1 +ENV MCCL_IB_GID_INDEX=1 +ENV NVTE_FLASH_ATTN=1 +ENV NVTE_FUSED_ATTN=0 + +ENV MCBLAS_CUSTOMIZED_CONFIG_PATH=/workspace/Megatron-LM_metax/mcblas_customized_config.yaml + +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python + diff --git a/training/metax/docker_image/megatron_core060/megatron_core060_install.sh b/training/metax/docker_image/megatron_core060/megatron_core060_install.sh new file mode 100644 index 000000000..a9bf588e2 --- /dev/null +++ 
b/training/metax/docker_image/megatron_core060/megatron_core060_install.sh @@ -0,0 +1 @@ +#!/bin/bash diff --git a/training/metax/docker_image/megatron_pai/Dockerfile b/training/metax/docker_image/megatron_pai/Dockerfile new file mode 100644 index 000000000..1c477326b --- /dev/null +++ b/training/metax/docker_image/megatron_pai/Dockerfile @@ -0,0 +1,37 @@ +FROM flagperf_megatron:2.23.0.13 +ENV PATH="/opt/conda/bin:${PATH}" +RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple" + +ENV MACA_PATH="/opt/maca" +ENV MACA_CLANG_PATH="${MACA_PATH}/mxgpu_llvm/bin" +ENV MACA_CLANG="${MACA_PATH}/mxgpu_llvm" +ENV DEVINFO_ROOT="${MACA_PATH}" +ENV WCUDA_PATH="${MACA_PATH}/tools/wcuda" +ENV CUDA_PATH="${WCUDA_PATH}" +RUN unset CUDA_HOME +ENV PATH="${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}" + +ENV MALLOC_THRESHOLD=99 +ENV PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:64" + +ENV MACA_SMALL_PAGESIZE_ENABLE=1 +ENV MALLOC_THRESHOLD=95 +ENV MCPYTORCH_DISABLE_PRINT=1 + +ENV MCCL_NET_GDR_LEVEL=7 +#ENV MCCL_MIN_NCHANNELS=16 +ENV MCCL_MAX_NCHANNELS=16 +ENV MCCL_P2P_LEVEL=SYS +ENV MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 +ENV FORCE_ACTIVATE_WAIT=1 + +ENV MHA_USE_BLAS=ON +ENV LD_LIBRARY_PATH="/root/FWD_76_BWD_79:${LD_LIBRARY_PATH}" +ENV SET_DEVICE_NUMA_PREFERRED=1 + +ENV MAX_JOBS=20 +ENV PYTORCH_ENABLE_SAME_RAND_A100=1 + +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python diff --git a/training/metax/docker_image/megatron_pai/megatron_pai_install.sh b/training/metax/docker_image/megatron_pai/megatron_pai_install.sh new file mode 100644 index 000000000..bcdeba5cd --- /dev/null +++ b/training/metax/docker_image/megatron_pai/megatron_pai_install.sh @@ -0,0 +1,7 @@ +#!/bin/bash +git clone https://github.com/alibaba/Pai-Megatron-Patch.git +cd /workspace/Pai-Megatron-Patch +git checkout aa7c56272cb53a7aeb7fa6ebbfa61c7fa3a5c2e4 +pip install -r requirements.txt -i 
https://mirrors.aliyun.com/pypi/simple/ +git submodule init +git submodule update Megatron-LM-240405 \ No newline at end of file diff --git a/training/metax/glm-pytorch/README.md b/training/metax/glm-pytorch/README.md index 0eaff257b..481cdc8ce 100644 --- a/training/metax/glm-pytorch/README.md +++ b/training/metax/glm-pytorch/README.md @@ -11,9 +11,9 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04.6 - OS kernel版本: 5.4.0-26-generic - - 加速卡驱动版本:2.2.0 + - 加速卡驱动版本:2.3.0 - Docker 版本:24.0.7 - - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl + - 训练框架版本:pytorch-2.0.0+mc2.23.0.16-cp38-cp38-linux_x86_64.whl - 依赖软件版本:无 @@ -41,6 +41,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU | | ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- | -| C500单机8卡(1x8) | fp32 | / | | | | | 0.802 | 54.5/64.0 | | -| C500单机单卡(1x1) | fp32 | / | | | | | / | 50.4/64.0 | | -| C500两机16卡(2x8) | fp32 | / | | | | | / | 29.8/64.0 | | +| C500单机8卡(1x8) | fp32 | / | | | | | 0.802 | 55.1/64.0 | | +| C500单机单卡(1x1) | fp32 | / | | | | | / | 55.6/64.0 | | +| C500两机16卡(2x8) | fp32 | / | | | | | / | 54.9/64.0 | | diff --git a/training/metax/glm-pytorch/config/config_C500x2x8.py b/training/metax/glm-pytorch/config/config_C500x2x8.py index a40988fed..ef2daafd2 100644 --- a/training/metax/glm-pytorch/config/config_C500x2x8.py +++ b/training/metax/glm-pytorch/config/config_C500x2x8.py @@ -1,11 +1,11 @@ fp16 = True ddp_type = "apex" -train_batch_size = 8 -eval_batch_size = 8 +train_batch_size = 16 +eval_batch_size = 16 dist_backend = "nccl" -lr = 1e-5 +lr = 2e-5 weight_decay = 0.1 adam_beta1 = 0.9 adam_beta2 = 0.999 diff --git a/training/metax/llama2_70B-megatron/config/config_C500x4x8.py b/training/metax/llama2_70B-megatron/config/config_C500x4x8.py index 7e7a001eb..06115e26e 100755 --- a/training/metax/llama2_70B-megatron/config/config_C500x4x8.py +++ 
b/training/metax/llama2_70B-megatron/config/config_C500x4x8.py @@ -1,10 +1,10 @@ seqlength = 4096 batchsize = 1 -accumulate_steps = 44 +accumulate_steps =1024 train_tokens = 100000000 -theoryflops = 240000000000000.0 +theoryflops = 280000000000000.0 epochs = 1 flashattn = True recompute = True tensor_parallel = 4 -pipeline_parallel = 8 \ No newline at end of file +pipeline_parallel = 8 diff --git a/training/metax/llama2_70B-megatron/config/requirements.txt b/training/metax/llama2_70B-megatron/config/requirements.txt deleted file mode 100755 index 080a0d3ac..000000000 --- a/training/metax/llama2_70B-megatron/config/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -sentencepiece diff --git a/training/metax/llama2_70B-megatron/config/training_adapter.sh b/training/metax/llama2_70B-megatron/config/training_adapter.sh index 2aaf4a2f1..4b32d2f5b 100755 --- a/training/metax/llama2_70B-megatron/config/training_adapter.sh +++ b/training/metax/llama2_70B-megatron/config/training_adapter.sh @@ -1,25 +1,5 @@ -export MALLOC_THRESHOLD=99 -export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:64" -export MACA_SMALL_PAGESIZE_ENABLE=1 -export MALLOC_THRESHOLD=95 -export MCPYTORCH_DISABLE_PRINT=1 - -export MCCL_NET_GDR_LEVEL=7 -#ENV MCCL_MIN_NCHANNELS=16 -export MCCL_MAX_NCHANNELS=16 -export MCCL_P2P_LEVEL=SYS -export MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 -export FORCE_ACTIVATE_WAIT=1 - -export MHA_USE_BLAS=ON -export LD_LIBRARY_PATH=/root/FWD_76_BWD_79:${LD_LIBRARY_PATH} -export SET_DEVICE_NUMA_PREFERRED=1 - -export MAX_JOBS=20 -export PYTORCH_ENABLE_SAME_RAND_A100=1 - +export PYTHONPATH=$PYTHONPATH:/workspace/Megatron-LM-FlagScale +CODE_PATH="/workspace/Megatron-LM-FlagScale/pretrain_llama.py" RECOMPUTE_ARGS=" - --recompute-granularity full \ - --recompute-method block \ - --recompute-num-layers 6 + --pipline-num-layers-list 9 9 10 10 10 11 11 10 " \ No newline at end of file diff --git a/training/metax/llama2_7b-megatron-deepspeed/config/environment_variables.sh 
b/training/metax/llama2_7b-megatron-deepspeed/config/environment_variables.sh index 5722e8916..5a50db331 100644 --- a/training/metax/llama2_7b-megatron-deepspeed/config/environment_variables.sh +++ b/training/metax/llama2_7b-megatron-deepspeed/config/environment_variables.sh @@ -1,37 +1,27 @@ -export MACA_PATH=/opt/maca-2.18.0.8 +export MACA_PATH=/opt/maca export MACA_CLANG_PATH=${MACA_PATH}/mxgpu_llvm/bin export MACA_CLANG=${MACA_PATH}/mxgpu_llvm export DEVINFO_ROOT=${MACA_PATH} - -unset CUDA_HOME -export PATH=${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH} +export CUDA_PATH=${MACA_PATH}/tools/cu-bridge +export CUDA_HOME=${CUDA_PATH} export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} -export CUCC_PATH=${MACA_PATH}/tools/cu-bridge -export CUDA_PATH=${CUCC_PATH} - -#MACA-PyTorch envs -export ISU_FASTMODEL=1 # must be set, otherwise may induce precision error -export USE_TDUMP=OFF # optional, use to control whether generating debug file -export TMEM_LOG=OFF # optional, use to control whether generating debug file -export DEBUG_ITRACE=0 # optional, use to control whether generating debug file +export ISU_FASTMODEL=1 +export USE_TDUMP=OFF +export TMEM_LOG=OFF +export DEBUG_ITRACE=0 -export MACA_SMALL_PAGESIZE_ENABLE=1 export MALLOC_THRESHOLD=99 -export MCPYTORCH_DISABLE_PRINT=1 -export SET_DEVICE_NUMA_PREFERRED=1 +export MCPYTORCH_CHECK_ANOMALY_INF=1 +export FORCE_ACTIVATE_WAIT=1 -export MCCL_NET_GDR_LEVEL=7 export MCCL_MAX_NCHANNELS=16 export MCCL_P2P_LEVEL=SYS export MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 -export FORCE_ACTIVATE_WAIT=1 +export MCPYTORCH_DISABLE_PRINT=1 export MHA_USE_BLAS=ON -export OMP_NUM_THREADS=1 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export MCCL_IB_GID_INDEX=3 -export MAX_JOBS=20 -export PYTORCH_ENABLE_SAME_RAND_A100=1 -export LD_LIBRARY_PATH='/root/FWD_76_BWD_79':$LD_LIBRARY_PATH +export MHA_BWD_NO_ATOMIC_F64=1 + +export SET_DEVICE_NUMA_PREFERRED=1 diff --git a/training/metax/llama2_7b_finetune-pytorch/README.md 
b/training/metax/llama2_7b_finetune-pytorch/README.md new file mode 100644 index 000000000..32b9e9f76 --- /dev/null +++ b/training/metax/llama2_7b_finetune-pytorch/README.md @@ -0,0 +1,65 @@ +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 依赖软件版本:见llama2_7b_finetune-pytorch/config/requirements.txt + +- ##### 并行策略 + + - 并行技术:无 + - 实施者:无 + - 实施细节:无 + +- ##### 优化策略 + + - 优化方案:lora + - 方案细节:LoraConfig( + auto_mapping=None, + base_model_name_or_path=None, + revision=None, task_type='CAUSAL_LM', + inference_mode=False, r=8, + target_modules=['q_proj', 'v_proj'], + lora_alpha=32, lora_dropout=0.05, + fan_in_fan_out=False, bias='none', + modules_to_save=None, + init_lora_weights=True, + layers_to_transform=None, + layers_pattern=None) + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(batch_size_training),简写为LBS,即实际进入模型的张量批尺寸,为config_A100x1x1.py中所写,在本case中默认为2 + 2. seq_length(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_A100x1x1.py中所写,在本case中默认为512 + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为1 + 4. global_batchsize恒等于local_batchsize*world_size,本case单卡运行因此为2. 
+ +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama2_7b | | +| 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | +| 数据精度 |fp32 | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | +| MMLU结果 | acc(推理/验证) | MMLU回答准确率(few_shots:5) | +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem |acc(MMLU) |MFU | +| ------------------- | ---------------- | ------ | ------- | --------- | --------- |--------- | +| C500单机单卡(1x1) | 数据精度=fp16, local_batchsize=4 | / | 1.72 | 32.0/64 | 0.41 |36.2%| + +>注: +>finetune训练数据集为samsum_dataset,MMLU数据集在这里只做配合lora-finetune后功能测试使用,MMLU评测结果无finetune结果指导意义,这里关注吞吐量即可。 \ No newline at end of file diff --git a/training/metax/llama2_7b_finetune-pytorch/config/config_C500x1x1.py b/training/metax/llama2_7b_finetune-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..2439b5938 --- /dev/null +++ b/training/metax/llama2_7b_finetune-pytorch/config/config_C500x1x1.py @@ -0,0 +1,11 @@ +batch_size_training: int = 4 +num_epochs: int = 3 +model_name: str = "llama2_7b_hf" +mmlu_dir = "mmlu_dataset" +dataset_dir = "samsum_dataset" +output_dir: str = "PEFT_model" +weight_dir = model_name +nproc = 1 +nnodes = 1 +use_fp16 = True +lr=1e-6 diff --git a/training/metax/llama2_7b_finetune-pytorch/config/environment_variables.sh b/training/metax/llama2_7b_finetune-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..8583ab628 --- /dev/null +++ b/training/metax/llama2_7b_finetune-pytorch/config/environment_variables.sh @@ -0,0 +1,31 @@ +export MACA_PATH=/opt/maca + +export MACA_CLANG_PATH=${MACA_PATH}/mxgpu_llvm/bin +export MACA_CLANG=${MACA_PATH}/mxgpu_llvm +export DEVINFO_ROOT=${MACA_PATH} +export PATH=${MACA_PATH}/bin:${MACA_CLANG}/bin:${PATH} +export 
LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} + +export CUCC_PATH=${MACA_PATH}/tools/cu-bridge +export CUDA_PATH=${CUCC_PATH} +export CUDA_HOME=${CUCC_PATH} + +#MACA-PyTorch envs +export ISU_FASTMODEL=1 # must be set, otherwise may induce precision error +export USE_TDUMP=OFF # optional, use to control whether generating debug file +export TMEM_LOG=OFF # optional, use to control whether generating debug file +export DEBUG_ITRACE=0 # optional, use to control whether generating debug file + +# export MACA_SMALL_PAGESIZE_ENABLE=1 +export MALLOC_THRESHOLD=99 +export MCPYTORCH_CHECK_ANOMALY_INF=1 +export FORCE_ACTIVATE_WAIT=1 + +export MCCL_MAX_NCHANNELS=16 +export MCCL_P2P_LEVEL=SYS +export MCCL_LIMIT_RING_LL_THREADTHRESHOLDS=1 +export MCPYTORCH_DISABLE_PRINT=1 +export MHA_USE_BLAS=ON +export MHA_BWD_NO_ATOMIC_F64=1 + +export SET_DEVICE_NUMA_PREFERRED=1 diff --git a/training/metax/llama2_7b_finetune-pytorch/config/requirements.txt b/training/metax/llama2_7b_finetune-pytorch/config/requirements.txt new file mode 100644 index 000000000..7b69da024 --- /dev/null +++ b/training/metax/llama2_7b_finetune-pytorch/config/requirements.txt @@ -0,0 +1,14 @@ +loguru +accelerate==0.21.0 +appdirs +loralib +bitsandbytes==0.39.0 +black +datasets==2.14.5 +fire==0.5.0 +peft==0.4.0 +transformers==4.31.0 +sentencepiece==0.1.99 +py7zr +scipy +optimum \ No newline at end of file diff --git a/training/metax/llama2_7b_finetune-pytorch/extern/.gitkeep b/training/metax/llama2_7b_finetune-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/metax/llama3_8B-megatron/README.md b/training/metax/llama3_8B-megatron/README.md new file mode 100644 index 000000000..62c6903fa --- /dev/null +++ b/training/metax/llama3_8B-megatron/README.md @@ -0,0 +1,58 @@ + +### Nvidia GPU配置与运行信息参考 +#### A100环境配置 +- ##### 硬件环境 + + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 
5.4.0-26-generic + - 加速卡驱动版本:2.3.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.23.0.13-cp38-cp38-linux_x86_64.whl, megatron-core 0.6.0 + - 依赖软件版本:sentencepiece==0.2.0, transformers==4.40.1 + +- ##### 并行策略 + + - 并行技术:张量、流水、数据混合并行,具体并行方案见“运行情况”章节 + - 实施者:megatron-core + - 实施细节:PP2DP2TP2 + +- ##### 优化策略 + + - flash attention 2 + - recompute-activations + - transformer-engine local + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_A100x1x8.py中所写,在本case中默认为1。 + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_A100x1x8.py中所写,在本case中默认为8192,原则上不可更改 + 3. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size。在本case中,data_parallel_size=world_size/TPsize/PPsize。在本case中默认为512,使得globalbatchsize=4M tokens。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------- | --------------------- | ---------------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama3_8b | | +| 数据集 | wudao | wudao数据集来源于智源研究院
bin/idx数据集文件来源于阿里云灵骏团队
使用llama3 tokenizer预处理 | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/bf16 | +| 超参修改 | parallel,见“性能指标” | 格式为PPxDPyTPz,例如PP2DP4TP1 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | Metax C500 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +精度对齐需第21步及之后,所有步的loss与nvidia对应步的loss平均相对误差小于2%。NVloss曲线请联系智源研究院获取 + +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------------ | --------- | ----------- | ------ | --------- | ---------- | ----- | ----------- | +| C500单机8卡(1x8) | bf16 | PP2DP2TP2 | / | / | True | 51.6/64.0 | 40.4% | diff --git a/training/metax/llama3_8B-megatron/config/config_C500x1x8.py b/training/metax/llama3_8B-megatron/config/config_C500x1x8.py new file mode 100644 index 000000000..db221aac0 --- /dev/null +++ b/training/metax/llama3_8B-megatron/config/config_C500x1x8.py @@ -0,0 +1,7 @@ +tokenizer_path = "llama3_8b_hf" +localbs = 1 +train_steps = 300 +theoryflops = 312000000000000.0 +megatron_path = "/workspace/Megatron-LM_metax" # need to be aligned with DockerFile. In NGCtorch, it's /workspace/ + Megatron-LM +tensor_parallel = 1 +pipeline_parallel = 4 diff --git a/training/metax/llama3_8B-megatron/config/config_C500x4x8.py b/training/metax/llama3_8B-megatron/config/config_C500x4x8.py new file mode 100644 index 000000000..5f1494c84 --- /dev/null +++ b/training/metax/llama3_8B-megatron/config/config_C500x4x8.py @@ -0,0 +1,7 @@ +tokenizer_path = "llama3_8b_hf" +localbs = 1 +train_steps = 300 +theoryflops = 280000000000000.0 +megatron_path = "/workspace/Megatron-LM_metax" # need to be aligned with DockerFile. 
In NGCtorch, it's /workspace/ + Megatron-LM +tensor_parallel = 1 +pipeline_parallel = 2 diff --git a/training/metax/llama3_8B-megatron/config/training_adapter.sh b/training/metax/llama3_8B-megatron/config/training_adapter.sh new file mode 100644 index 000000000..991665b73 --- /dev/null +++ b/training/metax/llama3_8B-megatron/config/training_adapter.sh @@ -0,0 +1,21 @@ +# for 1*8 3150 +VENDOR_ARGS=" \ + --transformer-impl local \ + --use-distributed-optimizer \ + --use-mcore-models \ + --use-flash-attn \ + --pipline-num-layers-list 7 9 9 7 +" +# for 4*8 +# VENDOR_ARGS=" \ +# --transformer-impl local \ +# --use-distributed-optimizer \ +# --use-mcore-models \ +# --use-flash-attn \ +# --attention-dropout 0.0 \ +# --hidden-dropout 0.0 \ +# --recompute-granularity full \ +# --recompute-method block \ +# --recompute-num-layers 1 \ +# --recompute-num-layers-list 2 0 +# " \ No newline at end of file diff --git a/training/metax/mobilenetv2-pytorch/README.md b/training/metax/mobilenetv2-pytorch/README.md index 9596c5231..d900c3a10 100644 --- a/training/metax/mobilenetv2-pytorch/README.md +++ b/training/metax/mobilenetv2-pytorch/README.md @@ -11,9 +11,9 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04.6 - OS kernel版本: 5.4.0-26-generic - - 加速卡驱动版本:2.2.0 + - 加速卡驱动版本:2.3.0 - Docker 版本:24.0.7 - - 训练框架版本:pytorch-2.0.0+mc2.19.2.5-cp38-cp38-linux_x86_64.whl + - 训练框架版本:pytorch-2.0.0+mc2.20.2.20-cp38-cp38-linux_x86_64.whl - 依赖软件版本:无 @@ -40,7 +40,7 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ------ | --------- | -| MXC500 单机8卡(1x8) | fp32 | / | | | | | 68.31% | 49.0/64.0 | -| MXC500 单机单卡(1x1) | fp32 | / | / | | | | / | 47.3/64.0 | -| MXC500 两机8卡(2x8) | fp32 | / | / | | | | / | 48.7/64.0 | +| MXC500 单机8卡(1x8) | fp32 | bs=704,lr=0.72 | | | | | 68.00% | 62.2/64.0 | +| MXC500 单机单卡(1x1) | fp32 | bs=704,lr=0.72 | / | | | | / | 59.3/64.0 | +| MXC500 两机8卡(2x8) | fp32 | 
bs=640,lr=0.72 | / | | | | / | 58.7/64.0 | diff --git a/training/metax/mobilenetv2-pytorch/config/config_C500x1x1.py b/training/metax/mobilenetv2-pytorch/config/config_C500x1x1.py index 785b826b7..a5a924dde 100644 --- a/training/metax/mobilenetv2-pytorch/config/config_C500x1x1.py +++ b/training/metax/mobilenetv2-pytorch/config/config_C500x1x1.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 512 -eval_batch_size = 512 +train_batch_size = 704 +eval_batch_size = 704 lr = 0.72 max_epoch = 10 diff --git a/training/metax/mobilenetv2-pytorch/config/config_C500x1x8.py b/training/metax/mobilenetv2-pytorch/config/config_C500x1x8.py index 711fca336..81df413b8 100644 --- a/training/metax/mobilenetv2-pytorch/config/config_C500x1x8.py +++ b/training/metax/mobilenetv2-pytorch/config/config_C500x1x8.py @@ -1,5 +1,5 @@ from config_common import * -train_batch_size = 512 -eval_batch_size = 512 +train_batch_size = 704 +eval_batch_size = 704 lr = 0.72 diff --git a/training/metax/mobilenetv2-pytorch/config/config_C500x2x8.py b/training/metax/mobilenetv2-pytorch/config/config_C500x2x8.py index 4a52686af..ec2d4ca6f 100644 --- a/training/metax/mobilenetv2-pytorch/config/config_C500x2x8.py +++ b/training/metax/mobilenetv2-pytorch/config/config_C500x2x8.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 512 -eval_batch_size = 512 +train_batch_size = 640 +eval_batch_size = 640 lr = 0.72 max_epoch = 100 diff --git a/training/metax/qwen1.5_MoE-megatron/README.md b/training/metax/qwen1.5_MoE-megatron/README.md new file mode 100644 index 000000000..ea40ea2be --- /dev/null +++ b/training/metax/qwen1.5_MoE-megatron/README.md @@ -0,0 +1,55 @@ +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl, deepspeed 0.10.0, 
Megatron-DeepSpeed.git@11f2d9342 + - 依赖软件版本:sentencepiece, transformers==4.34.1 +- ##### 并行策略 + - 并行技术: data parallel, tensor parallel, pipeline parallel + - 实施者:Megatron Deepspeed +- ##### 优化策略 + - flash attention 2 + +- ##### 依赖环境 + + - sentencepiece==0.2.0 + - 注:不同版本的sentencepiece分词方式可能会有所不同,为了训练误差的比对,在本case中将sentencepiece的版本设置为0.2.0 + +### 运行情况 + +* 输入批尺寸 + +1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_ C500x1x8.py中所写,在本case中默认为1。 +2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_C500x1x8.py中所写,在本case中默认为8192。 +3. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size。在本case中默认为512。 +4. 在本case中,data_parallel_size=world_size/TPsize/PPsize。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | qwen1.5_14B_MoE | | +| 数据集 | pile wikipedia | | +| 数据精度 | precision,见“性能指标” | 可选fp16/bf16 | +| 硬件设备简称 | MXC500 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +本例训练100 step,此项实验也将作为精度对齐所用实验。精度对齐需第21step及之后,所有步的loss与nvidia对应步的loss平均相对误差小于2%。 + +`注:因原仓库没有提供精度参考,因此我们基于源代码跑出NV版本的loss作为参考值。实验显示lm loss value随机初始值约为12,训练100轮后降到了8.53,呈现收敛特性。将其设为参考值是为了比较其他芯片在相同的配置下,loss的降低过程是否匹配,以进一步定量对比 mem 与 MFU 。` + +| 配置 | precision | parallel | fix_hp | token/p/s | lm loss value| mem | MFU | +| ------------------ | -------- | --------- | ---------------- | ------ | ------- | --------- | --------- | +| C500单机8卡(1x8) | bf16 | TP1PP2DP4 | / | / | 8.55 | 63/64 | 26.91% | +| C500单机8卡(1x8) | fp16 | TP1PP2DP4 | /| / | 8.59 | 63/64 | 27.24% | \ No newline at end of file diff --git a/training/metax/qwen1.5_MoE-megatron/config/config_C500x1x8.py b/training/metax/qwen1.5_MoE-megatron/config/config_C500x1x8.py new file mode 100644 index 000000000..e516162b6 --- /dev/null +++ b/training/metax/qwen1.5_MoE-megatron/config/config_C500x1x8.py @@ -0,0 
+1,10 @@ +batchsize = 1 +gbs = 512 +seqlength = 8192 +padlength = 8192 +precision = 'bf16' +tensor_parallel = 1 +pipeline_parallel = 2 +accumulate_steps = 1 +theoryflops = 280000000000000.0 +epochs = 1 \ No newline at end of file diff --git a/training/metax/qwen1.5_MoE-megatron/config/requirements.txt b/training/metax/qwen1.5_MoE-megatron/config/requirements.txt new file mode 100644 index 000000000..654de59b5 --- /dev/null +++ b/training/metax/qwen1.5_MoE-megatron/config/requirements.txt @@ -0,0 +1 @@ +sentencepiece==0.2.0 diff --git a/training/metax/qwen1.5_MoE-megatron/config/training_adapter.sh b/training/metax/qwen1.5_MoE-megatron/config/training_adapter.sh new file mode 100644 index 000000000..499b9af53 --- /dev/null +++ b/training/metax/qwen1.5_MoE-megatron/config/training_adapter.sh @@ -0,0 +1 @@ +echo "[Prompt] metax adaption is NULL, for other Vendors" \ No newline at end of file diff --git a/training/metax/resnet50-pytorch/config/config_C550x1x8.py b/training/metax/resnet50-pytorch/config/config_C550x1x8.py deleted file mode 100644 index 9ff943928..000000000 --- a/training/metax/resnet50-pytorch/config/config_C550x1x8.py +++ /dev/null @@ -1,3 +0,0 @@ -lr = 0.4 -train_batch_size = 128 -eval_batch_size = 128 diff --git a/training/metax/retinanet-pytorch/README.md b/training/metax/retinanet-pytorch/README.md index e6e28ae73..8cb84f676 100644 --- a/training/metax/retinanet-pytorch/README.md +++ b/training/metax/retinanet-pytorch/README.md @@ -22,9 +22,9 @@ torchvision.models.resnet.ResNet50_Weights.IMAGENET1K_V1.value.url = 'https://do - ##### 软件环境 - OS版本:Ubuntu 20.04.6 - OS kernel版本: 5.4.0-26-generic - - 加速卡驱动版本:2.2.0 + - 加速卡驱动版本:2.3.0 - Docker 版本:24.0.7 - - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl + - 训练框架版本:pytorch-2.0.0+mc2.20.2.20-cp38-cp38-linux_x86_64.whl - 依赖软件版本:无 @@ -51,7 +51,7 @@ torchvision.models.resnet.ResNet50_Weights.IMAGENET1K_V1.value.url = 'https://do | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mAP 
| mem | | ------------------- | --------- | ------------- | -------- | ------- | ------- | ------ | ------ | --------- | -| MXC500 单机8卡(1x8) | fp32 | / | | | | | 0.3517 | 37.5/64.0 | -| MXC500 单机单卡(1x1) | fp32 | / | / | | | | / | 25.1/64.0 | -| MXC500 两机8卡(2x8) | fp32 | / | / | | | | / | 59.6/64.0 | +| MXC500 单机8卡(1x8) | fp32 | bs=24,lr=0.04 | | | | | 0.3414 | 59.4/64.0 | +| MXC500 单机单卡(1x1) | fp32 | bs=40,lr=0.01 | / | | | | / | 61.9/64.0 | +| MXC500 两机8卡(2x8) | fp32 | bs=22,lr=0.02 | / | | | | / | 62.0/64.0 | diff --git a/training/metax/retinanet-pytorch/config/config_C500x1x1.py b/training/metax/retinanet-pytorch/config/config_C500x1x1.py index 640b6acd4..ebfa2f2e8 100644 --- a/training/metax/retinanet-pytorch/config/config_C500x1x1.py +++ b/training/metax/retinanet-pytorch/config/config_C500x1x1.py @@ -1,7 +1,7 @@ vendor: str = "metax" -train_batch_size = 16 -eval_batch_size = 16 +train_batch_size = 40 +eval_batch_size = 40 lr = 0.01 max_epoch = 1 cudnn_benchmark = False \ No newline at end of file diff --git a/training/metax/retinanet-pytorch/config/config_C500x1x8.py b/training/metax/retinanet-pytorch/config/config_C500x1x8.py index edb21b323..2670b46ca 100644 --- a/training/metax/retinanet-pytorch/config/config_C500x1x8.py +++ b/training/metax/retinanet-pytorch/config/config_C500x1x8.py @@ -1,7 +1,7 @@ vendor: str = "metax" -train_batch_size = 16 -eval_batch_size = 16 +train_batch_size = 24 +eval_batch_size = 24 lr = 0.04 -seed = 101 +seed = 192 cudnn_benchmark = False diff --git a/training/metax/retinanet-pytorch/config/config_C500x2x8.py b/training/metax/retinanet-pytorch/config/config_C500x2x8.py index 9439e2aaf..011b8fb6a 100644 --- a/training/metax/retinanet-pytorch/config/config_C500x2x8.py +++ b/training/metax/retinanet-pytorch/config/config_C500x2x8.py @@ -1,8 +1,8 @@ vendor: str = "metax" -train_batch_size = 16 -eval_batch_size = 16 -lr = 0.04 +train_batch_size = 22 +eval_batch_size = 22 +lr = 0.02 max_epoch = 18 seed = 101 cudnn_benchmark = False \ 
No newline at end of file diff --git a/training/metax/retinanet-pytorch/config/config_C550x1x8.py b/training/metax/retinanet-pytorch/config/config_C550x1x8.py deleted file mode 100644 index edb21b323..000000000 --- a/training/metax/retinanet-pytorch/config/config_C550x1x8.py +++ /dev/null @@ -1,7 +0,0 @@ -vendor: str = "metax" - -train_batch_size = 16 -eval_batch_size = 16 -lr = 0.04 -seed = 101 -cudnn_benchmark = False diff --git a/training/metax/swin_transformer-pytorch/config/environment_variables.sh b/training/metax/swin_transformer-pytorch/config/environment_variables.sh index e49e02a04..471e3e168 100644 --- a/training/metax/swin_transformer-pytorch/config/environment_variables.sh +++ b/training/metax/swin_transformer-pytorch/config/environment_variables.sh @@ -1,4 +1,4 @@ # ================================================= # Export variables # ================================================= - +export METAX_USE_TF32=1 diff --git a/training/metax/swin_transformer-pytorch/config/requirements.txt b/training/metax/swin_transformer-pytorch/config/requirements.txt index 447a7b0e8..cd0372356 100644 --- a/training/metax/swin_transformer-pytorch/config/requirements.txt +++ b/training/metax/swin_transformer-pytorch/config/requirements.txt @@ -1,4 +1,3 @@ -http://repo.metax-tech.com/r/pypi/simple/torch-2.0.0+gite544b36-cp38-cp38-linux_x86_64.whl numpy tqdm schedule diff --git a/training/nvidia/chatglm3_6b-deepspeed/README.md b/training/nvidia/chatglm3_6b-deepspeed/README.md index 0437e8242..088544ec3 100644 --- a/training/nvidia/chatglm3_6b-deepspeed/README.md +++ b/training/nvidia/chatglm3_6b-deepspeed/README.md @@ -1,8 +1,8 @@ ### Nvidia GPU配置与运行信息参考 #### 环境配置 - ##### 硬件环境 - - 机器型号: NVIDIA DGX A800(80G) - - 加速卡型号: NVIDIA_A800-SXM4-80GB + - 机器型号: NVIDIA DGX H100(80G) + - 加速卡型号: NVIDIA_H100-SXM-80GB - CPU型号: AMD EPYC7742-64core@1.5G - 多机网络类型、带宽: InfiniBand,200Gb/s @@ -22,8 +22,8 @@ ### 运行情况 * 输入批尺寸 - 1. 
local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_A100x1x8.py中所写,在本case中默认为**1** - 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_A100x1x8.py中所写,在本case中默认为**8192** + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_H100x1x8.py中所写,在本case中默认为**1** + 2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_H100x1x8.py中所写,在本case中默认为**4096** 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为**1** 4. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size。 @@ -43,6 +43,8 @@ * 性能指标 -| 配置 | fix_hp | token/p/s | loss | mem | MFU | -| ------------------- | ---------------- | ------ | ------- | --------- | --------- | -| A800单机8卡(1x8) | mpe=4096 | 3486.4 | 4.65 | 61/80 | 40.2% | +精度对齐需第21步及之后,所有步的loss与nvidia对应步的loss平均相对误差小于2%。NVloss曲线请联系智源研究院获取 + +| 配置 | precision | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------------ | --------- | ------ | --------- | ---------- | ----- | ----------- | +| H100单机8卡(1x8) | bf16 | mpe=4096 | 8429.22 | True(作为基线) | 61/80 | 30.7% | diff --git a/training/nvidia/chatglm3_6b-deepspeed/config/config_H100x1x8.py b/training/nvidia/chatglm3_6b-deepspeed/config/config_H100x1x8.py new file mode 100644 index 000000000..e95913d89 --- /dev/null +++ b/training/nvidia/chatglm3_6b-deepspeed/config/config_H100x1x8.py @@ -0,0 +1,6 @@ +seqlength = 4096 +batchsize = 1 +datafilename = "openwebtext_chatglm3_100M.npy" +theoryflops = 989000000000000.0 +epochs = 1 +flashattn = True diff --git a/training/nvidia/chatglm3_6b-deepspeed/config/ds_config.json b/training/nvidia/chatglm3_6b-deepspeed/config/ds_config.json index e2b0f2b02..3498d2a7a 100644 --- a/training/nvidia/chatglm3_6b-deepspeed/config/ds_config.json +++ b/training/nvidia/chatglm3_6b-deepspeed/config/ds_config.json @@ -1,3 +1,3 @@ { - "gradient_accumulation_steps": 64 + "gradient_accumulation_steps": 1 } diff --git 
a/training/nvidia/docker_image/flagscale_2409/Dockerfile b/training/nvidia/docker_image/flagscale_2409/Dockerfile new file mode 100644 index 000000000..904d83e90 --- /dev/null +++ b/training/nvidia/docker_image/flagscale_2409/Dockerfile @@ -0,0 +1,4 @@ +FROM flagscale:24.08.18-ngc-24.05-conda +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python diff --git a/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh b/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh new file mode 100644 index 000000000..a9bf588e2 --- /dev/null +++ b/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh @@ -0,0 +1 @@ +#!/bin/bash diff --git a/training/nvidia/llama2_7b_finetune-pytorch/README.md b/training/nvidia/llama2_7b_finetune-pytorch/README.md index 7a2b61046..9606cb628 100644 --- a/training/nvidia/llama2_7b_finetune-pytorch/README.md +++ b/training/nvidia/llama2_7b_finetune-pytorch/README.md @@ -62,6 +62,7 @@ | 配置 | fix_hp | token/p/s | loss | mem |acc(MMLU) |MFU | | ------------------- | ---------------- | ------ | ------- | --------- | --------- |--------- | | A100单机单卡(1x1) | / | 2788 | 1.64 | 37.3/40 | 0.38 |/| +| A100单机单卡(1x1) | 数据精度=fp16, local_batchsize=4 | 4017 | 1.77 | 32.0/40 | 0.43 |36.1%| >注: >finetune训练数据集为samsum_dataset,MMLU数据集在这里只做配合lora-finetune后功能测试使用,MMLU评测结果无finetune结果指导意义,这里关注吞吐量即可。 \ No newline at end of file diff --git a/training/nvidia/llama3_70B-flagscale/README.md b/training/nvidia/llama3_70B-flagscale/README.md new file mode 100644 index 000000000..d44c78c53 --- /dev/null +++ b/training/nvidia/llama3_70B-flagscale/README.md @@ -0,0 +1 @@ +此测例为FlagScale相关项目测例 diff --git a/training/nvidia/llama3_70B-flagscale/config/config_H100x4x8.py b/training/nvidia/llama3_70B-flagscale/config/config_H100x4x8.py new file mode 100644 index 000000000..5d5fb369d --- /dev/null +++ 
b/training/nvidia/llama3_70B-flagscale/config/config_H100x4x8.py @@ -0,0 +1,24 @@ +# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage +scale_parent = "/share/project/shhh/xlcllm" +scale_home = f"{scale_parent}/FlagScale" + +# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py +scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout a44556c" + +# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here +scale_install_cmd = "" + +scale_conf_dir = f"{scale_home}/examples/llama/conf" +configyaml = f"{scale_conf_dir}/config.yaml" +trainyaml = f"{scale_conf_dir}/train/train_llama3_70b.yaml" +dataset = f"SAMPLE50B/llama3/llama3_dataset" +tokenizer = f"SAMPLE50B/llama3/llama3_tokenizer" + +cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"} +# flagscale's requirements +flagscale_chip_type = "H100" +flagscale_ssh_port = 22 +flops = 989E12 + +# for llava's algorithm +steps = 30 diff --git a/training/nvidia/llama3_70B-flagscale/config/requirements.txt b/training/nvidia/llama3_70B-flagscale/config/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/training/nvidia/llama3_70B_continuetrain-flagscale/README.md b/training/nvidia/llama3_70B_continuetrain-flagscale/README.md new file mode 100644 index 000000000..d44c78c53 --- /dev/null +++ b/training/nvidia/llama3_70B_continuetrain-flagscale/README.md @@ -0,0 +1 @@ +此测例为FlagScale相关项目测例 diff --git a/training/nvidia/llama3_70B_continuetrain-flagscale/config/config_H100x4x8.py b/training/nvidia/llama3_70B_continuetrain-flagscale/config/config_H100x4x8.py new file mode 100644 index 000000000..c8962a242 --- /dev/null +++ b/training/nvidia/llama3_70B_continuetrain-flagscale/config/config_H100x4x8.py @@ -0,0 +1,25 @@ +# 
scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage +scale_parent = "/share/project/shhh/xlcllm" +scale_home = f"{scale_parent}/FlagScale" + +# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py +scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout a44556c" + +# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here +scale_install_cmd = "" + +scale_conf_dir = f"{scale_home}/examples/llama/conf" +configyaml = f"{scale_conf_dir}/config.yaml" +trainyaml = f"{scale_conf_dir}/train/train_llama3_70b_finetune.yaml" +dataset = f"SAMPLE50B/llama3/llama3_dataset" +tokenizer = f"SAMPLE50B/llama3/llama3_tokenizer" +ckpt = f"llama3_ckpt" + +cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"} +# flagscale's requirements +flagscale_chip_type = "H100" +flagscale_ssh_port = 22 +flops = 989E12 + +# for llava's algorithm +steps = 500 diff --git a/training/nvidia/llama3_70B_continuetrain-flagscale/config/requirements.txt b/training/nvidia/llama3_70B_continuetrain-flagscale/config/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/training/nvidia/llava1.5_7b-flagscale/README.md b/training/nvidia/llava1.5_7b-flagscale/README.md new file mode 100644 index 000000000..d44c78c53 --- /dev/null +++ b/training/nvidia/llava1.5_7b-flagscale/README.md @@ -0,0 +1 @@ +此测例为FlagScale相关项目测例 diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py new file mode 100644 index 000000000..56b050a17 --- /dev/null +++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py @@ -0,0 +1,27 @@ +# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore 
cannot use shared storage +scale_parent = "/share" +scale_home = f"{scale_parent}/FlagScale" + +# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py +scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 085811f" + +# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here +scale_install_cmd = "" + +# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py +energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs" + +scale_conf_dir = f"{scale_home}/examples/llava/conf" +configyaml = f"{scale_conf_dir}/config.yaml" +trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml" +datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml" +prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" + +cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"} +# flagscale's requirements +flagscale_chip_type = "H100" +flagscale_ssh_port = 22 +flops = 989E12 + +# for llava's algorithm +steps = 30 diff --git a/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt b/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt new file mode 100644 index 000000000..4f0d1d961 --- /dev/null +++ b/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt @@ -0,0 +1 @@ +megatron-energon==2.2.0 diff --git a/training/nvidia/llava1.5_7b_continuetrain-flagscale/README.md b/training/nvidia/llava1.5_7b_continuetrain-flagscale/README.md new file mode 100644 index 000000000..d44c78c53 --- /dev/null +++ b/training/nvidia/llava1.5_7b_continuetrain-flagscale/README.md @@ -0,0 +1 @@ +此测例为FlagScale相关项目测例 diff --git a/training/nvidia/llava1.5_7b_continuetrain-flagscale/config/config_H100x4x8.py 
b/training/nvidia/llava1.5_7b_continuetrain-flagscale/config/config_H100x4x8.py new file mode 100644 index 000000000..5ec87fec7 --- /dev/null +++ b/training/nvidia/llava1.5_7b_continuetrain-flagscale/config/config_H100x4x8.py @@ -0,0 +1,27 @@ +# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage +scale_parent = "/share" +scale_home = f"{scale_parent}/FlagScale" + +# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py +scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 085811f" + +# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here +scale_install_cmd = "" + +# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py +energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs" + +scale_conf_dir = f"{scale_home}/examples/llava/conf" +configyaml = f"{scale_conf_dir}/config.yaml" +trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml" +datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml" +prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" + +cmds = {"before_start": "source /root/miniconda3/bin/activate flagscale"} +# flagscale's requirements +flagscale_chip_type = "H100" +flagscale_ssh_port = 22 +flops = 989E12 + +# for llava's algorithm +steps = 5000 diff --git a/training/nvidia/llava1.5_7b_continuetrain-flagscale/config/requirements.txt b/training/nvidia/llava1.5_7b_continuetrain-flagscale/config/requirements.txt new file mode 100644 index 000000000..4f0d1d961 --- /dev/null +++ b/training/nvidia/llava1.5_7b_continuetrain-flagscale/config/requirements.txt @@ -0,0 +1 @@ +megatron-energon==2.2.0 diff --git 
a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py index 0723c086c..be628e197 100644 --- a/training/run_benchmarks/config/cluster_conf.py +++ b/training/run_benchmarks/config/cluster_conf.py @@ -10,4 +10,4 @@ MASTER_PORT = "29501" # ssh connection port -SSH_PORT = "22" \ No newline at end of file +SSH_PORT = "22" diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py old mode 100644 new mode 100755 index 227fca7a0..7f8a5860a --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -22,7 +22,7 @@ # mthreads: # " --env MTHREADS_VISIBLE_DEVICES=all" # metax: -# " --device=/dev/dri --device=/dev/mxcd --group-add video" +# " --device=/dev/infiniband --device=/dev/dri --device=/dev/mxcd --group-add video" # dcu: # "-v /opt/hyhal/:/opt/hyhal/ --device=/dev/kfd --device=/dev/dri/ --group-add video" ACCE_CONTAINER_OPT = " --gpus all" @@ -37,7 +37,7 @@ ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES" # Set pip source, which will be used in preparing envs in container -PIP_SOURCE = "https://mirror.baidu.com/pypi/simple" +PIP_SOURCE = "https://pypi.tuna.tsinghua.edu.cn/simple" # The path that flagperf deploy in the cluster. 
# Users must set FLAGPERF_PATH to where flagperf deploy @@ -62,13 +62,16 @@ ''' CASES = { # nvidia cases - "llama3_8B:megatron_core060:A100:1:8:1": "/data/llama3_8b_pretrain" + #"llama3_8B:megatron_core060:A100:1:8:1": "/data/llama3_8b_pretrain" # "llama3_70B:megatron_core060:H100:8:8:1": "/data/llama3_70b_pretrain" # "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", - + #"llava1.5_7b:flagscale_2409:H100:4:8:1": "/workspace/data_dir" + #"llava1.5_7b_continuetrain:flagscale_2409:H100:4:8:1": "/workspace/data_dir" + #"llama3_70B:flagscale_2409:H100:4:8:1": "/workspace/data_dir" + #"llama3_70B_continuetrain:flagscale_2409:H100:4:8:1": "/workspace/data_dir" # "llava1.5_7b:deepspeed-torch:A800:1:8:1": "/raid/dataset/LLAVA/", #"llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/", #"aquila2_7b_finetune:flagscale:A800:1:8:1": "/raid/dataset/aquila2_7b_finetune", @@ -85,14 +88,14 @@ # "resnet50:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "mask_rcnn:pytorch_1.8:A100:1:8:1": "/raid/dataset/maskrcnn/coco2017", # "dlrm:pytorch_1.10:A100:1:8:1": "/raid/dataset/criteo_1TB_click_logs/binary_dataset/", - + # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", # "moflow:pytorch_1.13:A100:1:8:1": "/raid/dataset/MoFlow/data/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", - + # "transformer:pytorch_1.13:A100:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "transformer_xl:pytorch_1.8:A100:1:8:1": "/raid/dataset/transformer_xl/", @@ -102,7 +105,7 @@ # "bert_hf:pytorch_1.13:A100:1:8:1": 
"/raid/dataset/bert_hf_train", # "longformer:pytorch_1.12:A100:1:8:1": "/raid/dataset/longformer_train/", # "detr:pytorch_1.13:A100:1:8:1": "/raid/dataset/detr/coco2017/", - + # "llama2_7b:deepspeed:A100:1:8:1": "/raid/dataset/llama2_7b_pretrain", # "aquila2_7b:flagscale:A100:1:8:1": "/raid/dataset/aquila2_7b_pretrain", # "llama2_70B:megatron:H800:4:8:1": "/raid/dataset/llama2_70B_pretrain", @@ -123,7 +126,7 @@ # "gpt3_13B:paddle_2.5.1:TP2PP1SH1SP4A10040G:1:8:1":"/raid/dataset/gpt-3/" # "gpt3_13B:paddle_2.5.1:TP2PP1SH2SP4A10040G:1:8:1":"/raid/dataset/gpt-3/" # "gpt3_13B:paddle_2.5.1:TP2PP4SH1SP1A10040G:1:8:1":"/raid/dataset/gpt-3/" - + # "qwen1.5_MoE:megatron_pai:A800:1:8:1":"/raid/datasets/qwen1.5_MoE/" # "mixtral_8x7B:megatron_core060:H100:4:8:1": "/raid/datasets/mistral" @@ -149,6 +152,7 @@ # "llama2_70B:megatron:R300:10:8:1": "/raid/dataset/llama2_70B_pretrain", # "baichuan2_13b:deepspeed:R300:1:8:1": "/raid/dataset/baichuan_data/", # "baichuan2_13b:deepspeed_new:R300:1:1:1": "/raid/dataset/baichuan_data/", + # "chatglm3_6b:deepspeed_v0.14.4:R300:1:8:1": "/raid/dataset/chatglm3_6b_data/", # iluvatar cases # "bigtransfer:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", @@ -158,6 +162,8 @@ # "t5_small:pytorch:BI-V100:1:8:1": "/raid/dataset/t5_small", # "baichuan2_13b:deepspeed:BI-V150:2:8:1": "/raid/dataset/baichuan2_13b", # "llava1.5_13b:deepspeed-torch:BI-V150:1:16:1": "/raid/dataset/llava1.5_13b", + # "mixtral_8x7B:megatron:BI-V150:4:16:1": "/raid/dataset/mixtral_8x7B", ##单机测试 + # "mixtral_8x7B:megatron:BI-V150:1:16:1": "/raid/dataset/mixtral_8x7B", ##四机测试 # mthreads cases # "resnet50:pytorch_2.0:S4000:1:8:1": "/data/flagperf/ImageNet", @@ -166,6 +172,11 @@ # "llama2_7b:deepspeed:S4000:1:8:1": "/data/flagperf/llama/openwebtext", # metax cases + #"llama3_8B:megatron_core060:C500:1:8:1": "/data/llama3_8b" + # "llama2_70B:megatron:C500:4:8:1": "/data/llama2-70B" + #"chatglm3_6b:deepspeed:C500:1:8:1": "/raid/dataset//chatglm3-6b" + 
#"llama2_7b:megatron-deepspeed:C500:1:8:1": "/raid/dataset/llama2-7b" + #"llama3_8B:megatron_core060:C500:1:8:1": "/data/llama3_8b_pretrain" # "aquila2_7b:flagscale:C500:1:8:1": "/raid/dataset/Aquila2_7b_data" # "faster_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", # "retinanet:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", @@ -191,7 +202,5 @@ # "llama1_7B:paddle_2.6.0:TP1PP1SH2SP8C50080G:1:8:1":"/raid/dataset/llama/" #"gpt3_13B:paddle_2.6.0:TP2PP1SH2SP4C50040G:1:8:1":"/raid/data_set/data-gpt3" #"gpt3_13B:paddle_2.6.0:TP1PP1SH2SP8C50080G:1:8:1":"/raid/data_set/data-gpt3" - + # "qwen1.5_MoE:megatron_pai:C500:1:8:1":"/raid/datasets/qwen1.5_MoE/" } - - diff --git a/training/run_benchmarks/flagscale/start_flagscale_task.py b/training/run_benchmarks/flagscale/start_flagscale_task.py index 46fd1e8b2..b250cde3e 100644 --- a/training/run_benchmarks/flagscale/start_flagscale_task.py +++ b/training/run_benchmarks/flagscale/start_flagscale_task.py @@ -1,8 +1,5 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- -'''This script is called in container to execute the real training task. - Support pytorch DDP only. 
-''' import os import sys import subprocess @@ -48,6 +45,10 @@ def parse_args(): type=int, required=True, help="how many processes will run on each host.") + parser.add_argument("--hosts", + type=str, + required=True, + help="hosts to run the testcase.") parser.add_argument("--vendor", type=str, @@ -119,7 +120,9 @@ def main(): exec_cmd = "cd " + os.path.dirname(train_script_path) + ";" exec_cmd = exec_cmd + "python run_pretraining.py" - exec_cmd = exec_cmd + " --world_size=" + str(task_args.nproc) + exec_cmd = exec_cmd + " --world_size=" + str(task_args.nproc * task_args.nnodes) + exec_cmd = exec_cmd + " --hosts=" + task_args.hosts + exec_cmd = exec_cmd + " --host_addr=" + task_args.host_addr exec_cmd = exec_cmd + " --vendor=" + task_args.vendor exec_cmd = exec_cmd + " --data_dir=" + task_args.data_dir exec_cmd = exec_cmd + " --log_dir=" + task_log_dir @@ -127,6 +130,7 @@ def main(): task_log_file = os.path.join(task_log_dir, "rank0.out.log") + START_LOGGER.info(exec_cmd) with open(task_log_file, "w") as f: p = subprocess.Popen(exec_cmd, shell=True, diff --git a/training/run_benchmarks/megatron/start_megatron_task.py b/training/run_benchmarks/megatron/start_megatron_task.py index 9a71ebac4..b0e369bda 100644 --- a/training/run_benchmarks/megatron/start_megatron_task.py +++ b/training/run_benchmarks/megatron/start_megatron_task.py @@ -117,7 +117,7 @@ def main(): START_LOGGER.info("Hello Flagscale") exec_cmd = "cd " + os.path.dirname(train_script_path) + ";" - exec_cmd = exec_cmd + "python run_pretraining.py" + exec_cmd = exec_cmd + "python3 run_pretraining.py" exec_cmd = exec_cmd + " --nproc_per_node=" + str(task_args.nproc) exec_cmd = exec_cmd + " --nnodes=" + str(task_args.nnodes) exec_cmd = exec_cmd + " --node_rank=" + str(task_args.node_rank) diff --git a/training/run_benchmarks/run.py b/training/run_benchmarks/run.py index 5a845a611..be3dd3fe8 100644 --- a/training/run_benchmarks/run.py +++ b/training/run_benchmarks/run.py @@ -235,8 +235,7 @@ def 
clear_caches_cluster(clear, nnodes): def start_monitors_in_cluster(dp_path, case_log_dir, nnodes): '''Start sytem and vendor's monitors.''' - start_mon_cmd = "cd " + dp_path + " && " + sys.executable \ - + " ../utils/sys_monitor.py -o restart -l " + start_mon_cmd = "cd " + dp_path + " && " + sys.executable + " ../utils/sys_monitor.py -o restart -v " + tc.VENDOR + " -l " timeout = 60 RUN_LOGGER.debug("Run cmd in the cluster to start system monitors: " + start_mon_cmd) diff --git a/utils/run_cmd.py b/utils/run_cmd.py index 9e84697dc..7a0526291 100644 --- a/utils/run_cmd.py +++ b/utils/run_cmd.py @@ -10,6 +10,7 @@ def run_cmd_wait(cmd, timeout): '''Run a shell command and wait second(s).''' process = subprocess.Popen(cmd, shell=True, + executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='utf-8') diff --git a/utils/sys_monitor.py b/utils/sys_monitor.py index 3612f5fff..f6c84240d 100644 --- a/utils/sys_monitor.py +++ b/utils/sys_monitor.py @@ -39,6 +39,7 @@ def __init__(self, stderr=os.devnull, home_dir='.', umask=0o22, + vendor="nvidia", verbose=0): self.stdin = stdin self.stdout = stdout @@ -58,6 +59,7 @@ def __init__(self, self.umask = umask self.verbose = verbose self.daemon_alive = True + self.vendor=vendor def get_pid(self): try: @@ -102,6 +104,10 @@ def pwr_mon(file): TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') cmd = "ipmitool sdr list|grep -i Watts|awk 'BEGIN{FS = \"|\"}{for (f=1; f <= NF; f+=1) {if ($f ~ /Watts/)" \ " {print $f}}}'|awk '{print $1}'|sort -n -r|head -n1" + # support cambriocn mlu + if "cambricon" in self.vendor: + cmd = "echo $(( $(ipmitool sdr list | grep -i Watts | awk 'BEGIN{FS=\"|\"} {for (f=1; f<=NF; f++) {if ($f ~ /Watts/) print $f}}' | awk '{print $1}' | sort -n -r | head -n 1) + $(cnmon info -c 0 | grep 'Machine' | awk '{print $3}') ))" + res, out = rcw(cmd, 10) if res: result = "error" @@ -129,6 +135,7 @@ def timer_pwr_mon(): schedule.every(self.rate1).seconds.do(timer_cpu_mon) 
schedule.every(self.rate1).seconds.do(timer_mem_mon) schedule.every(self.rate2).seconds.do(timer_pwr_mon) + schedule.run_all() while True: schedule.run_pending() time.sleep(5) @@ -249,6 +256,12 @@ def parse_args(): required=False, default='./logs/', help='log path') + parse.add_argument('-v', + type=str, + metavar='[vendor]', + required=False, + default='nvidia', + help='gpu vendor') args = parse.parse_args() return args @@ -258,6 +271,7 @@ def main(): sample_rate2 = 120 args = parse_args() operation = args.o + vendor=args.v path = args.l pid_fn = str('/tmp/sys_monitor.pid') log_fn = str(path + '/sys_monitor.log') @@ -268,6 +282,7 @@ def main(): err_fn, path, verbose=1, + vendor=vendor, rate1=sample_rate1, rate2=sample_rate2) if operation == 'start': @@ -288,4 +303,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main()