From cb81f879d9a882a351ba3c0027e091f12df9d365 Mon Sep 17 00:00:00 2001
From: POI-WX <131418410+POI-WX@users.noreply.github.com>
Date: Wed, 6 Dec 2023 18:25:42 +0800
Subject: [PATCH 01/58] [DIPU] Wx/modify maximum schema due to the case in the
inference of internlm (#494)
* improve maximum schema due to the case in the inference of internlm
* fix bug according to comments
* fix bug
---
.../diopi_functions.yaml | 15 +++++--
.../python/unittests/test_minimum_maximum.py | 40 +++++++++++++++++++
2 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
index 8812397c5a..4b58185a24 100755
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -1134,8 +1134,12 @@
interface: diopiMaxAll(ctx, out, self)
- schema: "maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
- no_device_check_args: [other]
- interface: diopiMaximum(ctx, out, self, other)
+ no_device_check_args: [self, other]
+ ins: [selfTemp, otherTemp]
+ custom_code_at_the_beginning: |
+ auto selfTemp = (self.numel() == 1 && self.is_cpu()) ? self.to(other.device()) : self;
+ auto otherTemp = (other.numel() == 1 && other.is_cpu()) ? other.to(self.device()) : other;
+ interface: diopiMaximum(ctx, out, selfTemp, otherTemp)
- schema: "max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_indices) -> (Tensor(a!) max, Tensor(b!) max_indices)"
custom_code_at_the_beginning: |
@@ -1679,7 +1683,12 @@
interface: diopiClampMaxInp(ctx, self, max)
- schema: "minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
- interface: diopiMinimum(ctx,out, self, other)
+ no_device_check_args: [self, other]
+ ins: [selfTemp, otherTemp]
+ custom_code_at_the_beginning: |
+ auto selfTemp = (self.numel() == 1 && self.is_cpu()) ? self.to(other.device()) : self;
+ auto otherTemp = (other.numel() == 1 && other.is_cpu()) ? other.to(self.device()) : other;
+ interface: diopiMinimum(ctx, out, selfTemp, otherTemp)
- schema: "scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)"
interface: diopiScatterScalar(ctx, out, self, dim, value, index, "")
diff --git a/dipu/tests/python/unittests/test_minimum_maximum.py b/dipu/tests/python/unittests/test_minimum_maximum.py
index eecc57bc18..a6b00383d4 100644
--- a/dipu/tests/python/unittests/test_minimum_maximum.py
+++ b/dipu/tests/python/unittests/test_minimum_maximum.py
@@ -15,6 +15,26 @@ def test_minimum(self):
r_cpu = torch.minimum(a.to(self.cpu), b.to(self.cpu))
self.assertEqual(r_dipu.to(self.cpu), r_cpu)
+ def test_minimum_scalar(self):
+ # special test cases from the inference of internlm
+ a = torch.randn((3, 4))
+ b = torch.tensor(torch.finfo(a.dtype).max)
+ # scalar on cpu
+ r_dipu1 = torch.minimum(a.to(self.dipu), b)
+ # scalar on device
+ r_dipu2 = torch.minimum(a.to(self.dipu), b.to(self.dipu))
+ r_cpu = torch.minimum(a, b)
+ self.assertEqual(r_dipu1.to(self.cpu), r_cpu)
+ self.assertEqual(r_dipu2.to(self.cpu), r_cpu)
+
+ def test_minimum_different_devices(self):
+ a = torch.tensor([1, -2, 3])
+ b = torch.tensor([4, 0, 2]).to(self.dipu)
+ with self.assertRaises(RuntimeError) as context:
+ torch.minimum(a, b)
+ self.assertIn(
+ 'Expected all tensors to be on the same device', str(context.exception))
+
def test_maximum(self):
a = torch.tensor((1, 2, -1))
b = torch.tensor((3, 0, 4))
@@ -22,6 +42,26 @@ def test_maximum(self):
r_cpu = torch.maximum(a.to(self.cpu), b.to(self.cpu))
self.assertEqual(r_dipu.to(self.cpu), r_cpu)
+ def test_maximum_scalar(self):
+ # special test cases from the inference of internlm
+ a = torch.randn((3, 4))
+ b = torch.tensor(torch.finfo(a.dtype).min)
+ # scalar on cpu
+ r_dipu1 = torch.maximum(a.to(self.dipu), b)
+ # scalar on device
+ r_dipu2 = torch.maximum(a.to(self.dipu), b.to(self.dipu))
+ r_cpu = torch.maximum(a, b)
+ self.assertEqual(r_dipu1.to(self.cpu), r_cpu)
+ self.assertEqual(r_dipu2.to(self.cpu), r_cpu)
+
+ def test_maximum_different_devices(self):
+ a = torch.tensor([1, -2, 3])
+ b = torch.tensor([4, 0, 2]).to(self.dipu)
+ with self.assertRaises(RuntimeError) as context:
+ torch.maximum(a, b)
+ self.assertIn(
+ 'Expected all tensors to be on the same device', str(context.exception))
+
if __name__ == "__main__":
run_tests()
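For reference, the behaviour this patch enables and tests can be sketched with stock PyTorch calls; `"cuda"` stands in here for the DIPU device exposed through the CUDA mock (an illustrative assumption, not part of the patch):

```python
import torch

a = torch.randn(3, 4, device="cuda")        # tensor on the accelerator
b = torch.tensor(torch.finfo(a.dtype).max)  # one-element tensor on the CPU

# Allowed: the generated wrapper moves the one-element CPU operand onto the
# device before calling diopiMinimum / diopiMaximum.
r1 = torch.minimum(a, b)
r2 = torch.minimum(a, b.to(a.device))
assert torch.equal(r1.cpu(), r2.cpu())

# Still rejected: multi-element tensors on different devices must raise.
try:
    torch.maximum(torch.tensor([1, -2, 3]), torch.tensor([4, 0, 2], device="cuda"))
except RuntimeError as e:
    assert "Expected all tensors to be on the same device" in str(e)
```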
From d6c0094275afccf0147228926a53ba782519ca59 Mon Sep 17 00:00:00 2001
From: wiryls <7984500+wiryls@users.noreply.github.com>
Date: Wed, 6 Dec 2023 19:25:30 +0800
Subject: [PATCH 02/58] [both] fix, format and remove spaces in README.md
(#497)
* doc(readme): fix, format and remove spaces
* fix: typo and try auto-correct
* feat(ci): add autocorrect into ci
* fix: remove autocorrect from ci as it's not ready
---
.github/workflows/format.yml | 15 +-
README.md | 2 +-
dicp/README.md | 85 +++++++++
dicp/readme.md | 85 ---------
dipu/Contributors.md | 14 +-
dipu/QuickStart.md | 20 +--
dipu/README.md | 24 +--
.../profiler/{readme.md => README.md} | 165 ++++++++++--------
8 files changed, 220 insertions(+), 190 deletions(-)
create mode 100644 dicp/README.md
delete mode 100644 dicp/readme.md
rename dipu/torch_dipu/profiler/{readme.md => README.md} (75%)
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index cbad72ae4a..32efe64a4e 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -11,19 +11,22 @@ jobs:
markdownlint:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - name: Checkout code
+ uses: actions/checkout@v4
with:
- fetch-depth: 2
- - uses: tj-actions/changed-files@v40
+ fetch-depth: 8
+ - name: Collect changed files
+ uses: tj-actions/changed-files@v40
id: changed-files
with:
files: '**/*.md'
- separator: ","
- - uses: DavidAnson/markdownlint-cli2-action@v14
+ separator: ','
+ - name: MarkdownLint
if: steps.changed-files.outputs.any_changed == 'true'
+ uses: DavidAnson/markdownlint-cli2-action@v14
with:
globs: ${{ steps.changed-files.outputs.all_changed_files }}
- separator: ","
+ separator: ','
clang-format:
needs: markdownlint
diff --git a/README.md b/README.md
index a3a17decde..7dd7aafe2f 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Deeplink.framework 是 DeepLink 推出的介于 AI 训练框架和硬件语言
### DIPU
-DIPU (Device Independent Process Unit) 是由一组抽象设备 runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层共同组成的拓展包。 用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
+DIPU (Device Independent Process Unit) 是由一组抽象设备 runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层共同组成的拓展包。用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被映射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 既可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
### DICP
diff --git a/dicp/README.md b/dicp/README.md
new file mode 100644
index 0000000000..db01a09b6c
--- /dev/null
+++ b/dicp/README.md
@@ -0,0 +1,85 @@
+
+
+
+
+
+
+# DICP
+
+标准编译协议(Device-Independent Compile Protocol, DICP)定义了统一的计算描述(中间表示),通过计算图获取深度学习模型中的计算任务表达为上述中间表示,然后通过计算图优化技术自动生成人工智能芯片设备代码,从而提高研发效率和计算的执行性能。中间表示是介于源语言和目标语言之间的程序表示,能够极大程度地提高编译流程的可拓展性,同时也能降低优化流程对前端和后端的破坏。多层次中间表示包含从应用到芯片端的多种表示层次,不同层次旨在解决不同尺度的问题。
+
+DICP 主要的核心功能如下:
+
+1. 通过接入编译路线带来性能优势,在大模型场景最大限度释放芯片能力。
+2. 作为训练框架与国产硬件芯片之间的通用桥梁,支持多种前后端,带来使用易用性。
+3. 提供易用、高效的一站式编译适配流程,灵活支持国产硬件图编译器的特性,提高芯片适配效率。
+
+下图描述了 DICP 在编译链路中的位置:
+
+
+
+1. 训练框架通过图获取模块将用户的模型代码转换成统一的中间表达。此处的中间表达完全与芯片无关。所以在之后的编译协议部分中,需要建立起与后端芯片的联系。这样才能高效的完成接入。
+2. 编译协议完成了衔接框架与芯片编译器的工作,其中包含硬件相关的切图,统一中间表达与芯片所支持的算子之间的映射关系以及数据格式的转换模块。
+3. 在编译协议吸收了芯片特点之后,由代码生成模块生成最终的代码,并通过芯片的编译器生成二进制可执行文件之后由框架调用。
+
+## 基于 DICP 的国产硬件接入 PyTorch 2 实践
+
+
+
+基于上述 DICP,国产硬件可快速接入 PyTorch 2 的编译路线。此路线中的 TorchDynamo 组件,可使国产硬件在运行时的 overhead 大幅缩小。
+并且针对国产硬件实现了以下特性:
+
+- 灵活支持国产硬件图编译器的特性
+- 支持多种国产硬件数据格式
+- 支持动态 shape
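As an illustration of this entry point, a minimal sketch of driving a model through the DICP compile route; it assumes the vendor graph compilers are registered as TorchDynamo backends under the names the test scripts pass (`topsgraph` / `ascendgraph`), which is stated here as an assumption rather than a documented API:

```python
import torch

model = torch.nn.Linear(16, 16)  # any nn.Module; a toy example

# "topsgraph" is assumed to be the backend name dicp registers with
# TorchDynamo (the same name run_test_model.sh uses); "ascendgraph" for Ascend.
compiled = torch.compile(model, backend="topsgraph")

# The first call triggers: FX graph capture -> vendor IR conversion and
# optimisation -> code generation -> execution on the device.
out = compiled(torch.randn(4, 16))
```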
+
+### 运行逻辑
+
+DICP 的运行逻辑如下图所示:
+
+
+
+
+其中:
+
+1. **算子映射**:主要解决框架层算子与后端图编译器的算子之间的语义差别,包括 1 对 1 和 1 对多的转换。
+2. **Shape & Dtype 推导**:进行 Shape & data_type 的推导,补全整张静态图上的信息,便于之后在代码生成模块能生成代码。
+3. **子图改写**:将多个小算子融合成为一个或多个适合图编译器的算子,配合后端图编译器将计算效率最大化。
+4. **数据格式调整**:是根据后端芯片与其图编译器的特性,针对特定的算子调整其输入输出的数据格式,使得最大程度的发挥芯片性能。
+
+### 目录结构
+
+- `dicp/dynamo_bridge`:多后端通用的接入代码,包含了
+ 1. 接收从 AOTAutograd 下发而来的 FX Graph
+ 2. 启动各个厂商的 IR 转换与优化
+ 3. 启动 CodeGen 以及 JIT 缓存的逻辑。
+- `dicp/vender`: 主要包含了各个厂商 IR 的定义,AtenIR 到厂商 IR 的转换,厂商 IR 上的优化以及最后的代码生成模块。
+- `test`: 包含了 model 测试与 op 测试
+
+### Demo
+
+#### 安装 DICP
+
+```bash
+cd /path_to_dicp
+pip install .
+```
+
+#### 在华为 910 上执行 llama7B 前向推理
+
+```bash
+export DIPU_MOCK_CUDA=false
+export DICP_TOPS_DIPU=True
+export TEST_DIR=/path_to_dicp/test/
+export LLAMA_MODEL_DIR=/path_to_llama_model
+bash /path_to_dicp/test/model/run_test_model.sh llama ascendgraph false
+```
+
+#### 在燧原 T20 上执行 resnet50 训练
+
+```bash
+export DIPU_MOCK_CUDA=false
+export DICP_TOPS_DIPU=True
+export TEST_DIR=/path_to_dicp/test/
+bash /path_to_dicp/test/model/run_test_model.sh resnet50 topsgraph false
+```
diff --git a/dicp/readme.md b/dicp/readme.md
deleted file mode 100644
index 6a5fc8de06..0000000000
--- a/dicp/readme.md
+++ /dev/null
@@ -1,85 +0,0 @@
-
-
-
-
-# DICP
-
-标准编译协议(Device-Independent Compile Protocol,DICP)定义了统一的计算描述(中间表示),通过计算图获取深度学习模型中的计算任务表达为上述中间表示,然后通过计算图优化技术自动生成人工智能芯片设备代码,从而提高研发效率和计算的执行性能。中间表示是介于源语言和目标语言之间的程序表示,能够极大程度地提高编译流程的可拓展性,同时也能降低优化流程对前端和后端的破坏。多层次中间表示包含从应用到芯片端的多种表示层次,不同层次旨在解决不同尺度的问题。
-
-DICP主要的核心功能如下:
-1. **通过接入编译路线带来性能优势,在大模型场景最大限度释放芯片能力**
-2. **作为训练框架与国产硬件芯片之间的通用桥梁,支持多种前后端,带来使用易用性**
-3. **提供易用、高效的一站式编译适配流程,灵活支持国产硬件图编译器的特性,提高芯片适配效率**
-
-下图描述了DICP在编译链路中的位置:
-
-
-
-
-*DICP在编译链路中的位置
-
-
-
-1. 训练框架通过图获取模块将用户的模型代码转换成统一的中间表达。此处的中间表达完全与芯片无关。所以在之后的编译协议部分中,需要建立起与后端芯片的联系。这样才能高效的完成接入。
-2. 编译协议完成了衔接框架与芯片编译器的工作,其中包含硬件相关的切图,统一中间表达与芯片所支持的算子之间的映射关系以及数据格式的转换模块。
-3. 在编译协议吸收了芯片特点之后,由代码生成模块生成最终的代码,并通过芯片的编译器生成二进制可执行文件之后由框架调用。
-
-
-
-## 基于DICP的国产硬件接入PyTorch2实践
-
-
-
-基于上述DICP,国产硬件可快速接入Pytorch2的编译路线。此路线中的TorchDynamo组件,可使国产硬件在运行时的overhead大幅缩小。
-并且针对国产硬件实现了以下特性:
- - 灵活支持国产硬件图编译器的特性
- - 支持多种国产硬件数据格式
- - 支持动态shape
-
-### 运行逻辑
-DICP的运行逻辑如下图所示:
-
-
-
-
-
-
-其中:
-1. **算子映射**: 主要解决框架层算子与后端图编译器的算子之间的语义差别,包括1对1和1对多的转换。
-2. **Shape&Dtype推导**: 进行Shape&data_type的推导,补全整张静态图上的信息,便于之后在代码生成模块能生成代码。
-3. **子图改写**: 将多个小算子融合成为一个或多个适合图编译器的算子,配合后端图编译器将计算效率最大化。
-4. **数据格式调整**: 是根据后端芯片与其图编译器的特性,针对特定的算子调整其输入输出的数据格式,使得最大程度的发挥芯片性能。
-
-### 目录结构
-* dicp/dynamo_bridge: 多后端通用的接入代码,包含了
- 1. 接收从AOTAutograd下发而来的FX Graph
- 2. 启动各个厂商的IR转换与优化
- 3. 启动CodeGen以及JIT缓存的逻辑。
-* dicp/vender: 主要包含了各个厂商IR的定义,AtenIR到厂商IR的转换,厂商IR上的优化以及最后的代码生成模块。
-* test: 包含了model测试与op测试
-
-
-### Demo
-
-#### 安装DICP
-
-```
-cd /path_to_dicp
-pip install .
-```
-
-#### 在华为910上执行llama7B前向推理
-```
-export DIPU_MOCK_CUDA = false
-export DICP_TOPS_DIPU = True
-export TEST_DIR = /path_to_dicp/test/
-export LLAMA_MODEL_DIR=/path_to_llama_model
-bash /path_to_dicp/test/model/run_test_model.sh llama ascendgraph false
-```
-
-#### 在燧原T20上执行resnet50训练
-```
-export DIPU_MOCK_CUDA = false
-export DICP_TOPS_DIPU = True
-export TEST_DIR = /path_to_dicp/test/
-bash /path_to_dicp/test/model/run_test_model.sh resnet50 topsgraph false
-```
diff --git a/dipu/Contributors.md b/dipu/Contributors.md
index bbfd7ae213..e612cf0bdd 100644
--- a/dipu/Contributors.md
+++ b/dipu/Contributors.md
@@ -18,7 +18,7 @@
### 拉取请求工作流
-如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考[GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)
+如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考 [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)
#### 复刻仓库
@@ -43,7 +43,7 @@ upstream git@github.com:DeepLink-org/deeplink.framework (fetch)
upstream git@github.com:DeepLink-org/deeplink.framework (push)
```
-> 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 `git clone` 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 dipu 。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。
+> 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 `git clone` 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 dipu。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。
#### 创建开发分支
@@ -59,7 +59,7 @@ git checkout -b xxx/refactor_contributing_doc
git pull upstream main
```
-#### 提交代码并在本地通过dipu测试
+#### 提交代码并在本地通过 DIPU 测试
提交的代码需要通过 DIPU 在各设备上的测例和模型 one_iter 测试。
@@ -78,11 +78,11 @@ git push -u origin {branch_name}
1. 在 GitHub 的 pull request 界面创建拉取请求
2. 根据指引修改 pull request 描述,以便于其他开发者更好地理解你的修改
-描述规范详见[拉取请求规范](#拉取请求规范)
+描述规范详见 [拉取请求规范](#拉取请求规范)
注意事项:
-- Pull request 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 issue(具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。
+- Pull request 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 issue(具体方式见 [GitHub 官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。
- 如果是第一次为 DIPU 做贡献,需要签署 CLA。
- 检查提交的 pull request 是否通过 CI(持续集成)。
- 如果 pull request 通过了 CI 检查,那么就可以等待其他开发者的 review,并根据 reviewer 的意见,修改代码,并重复上述步骤,直到 reviewer 同意合入 pull request。
@@ -117,7 +117,7 @@ git merge upstream/main
- 每次 commit 时需要提供清晰且有意义 commit 信息。
- 提供清晰且有意义的 pull request 描述:
- 标题写明白任务名称,参考格式:`[Prefix] Short description of the pull request (Suffix)`;
- - Prefix 参考:新增功能 `[Feature]`, 修 bug `[Fix]`, 文档相关 `[Docs]`, 开发中 `[WIP]` (暂时不会被 review)。
- - 描述里介绍 pull request 的主要修改内容,结果,以及对其他部分的影响, 参考 pull request 模板;
+ - Prefix 参考:新增功能 `[Feature]`, 修 bug `[Fix]`, 文档相关 `[Docs]`, 开发中 `[WIP]` (暂时不会被 review)。
+ - 描述里介绍 pull request 的主要修改内容,结果,以及对其他部分的影响,参考 pull request 模板;
- 关联相关的 issue 和其他 pull request。
- 如果引入了其他三方库,或借鉴了三方库的代码,请确认它们的许可证和 DIPU License 兼容,并在借鉴的代码上补充 `This code is inspired from `。
diff --git a/dipu/QuickStart.md b/dipu/QuickStart.md
index 10ccf63796..b5f640a2ad 100644
--- a/dipu/QuickStart.md
+++ b/dipu/QuickStart.md
@@ -167,7 +167,7 @@ export DIPU_FORCE_FALLBACK_OPS_LIST=add.out,conv2d
python -c "import torch_dipu"
```
-Fallback scalar 版本的重载函数, tensor 版本的重载函数类似:
+Fallback scalar 版本的重载函数,tensor 版本的重载函数类似:
```bash
export DIPU_FORCE_FALLBACK_OPS_LIST='.*.Scalar'
@@ -203,7 +203,7 @@ add_custom_command(
以上方法是对所有算子开启自动精度对比。如果只需要对特定算子做精度对比,也可只给需要的算子做精度对比,只需要在相关的配置文件(如 `dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml`)给相应的算子添加 `autocompare: True` 即可。
```shell
-$ unset DIPU_FORCE_FALLBACK_OPS_LIST # 主要是确保要比较的算子没有强制fallback到cpu,可选
+$ unset DIPU_FORCE_FALLBACK_OPS_LIST # 主要是确保要比较的算子没有强制 fallback 到 cpu, 可选
$ python
>>> import torch
>>> import torch_dipu
@@ -229,7 +229,7 @@ autocompare: add.out other: allclose
>>>
```
-可以看到,CPU 计算结果与设备计算结果 `allclose`,也能看到CPU和设备计算结果的 `shape`、`dtype` 等信息。特别的,需要注意以下几个问题:
+可以看到,CPU 计算结果与设备计算结果 `allclose`,也能看到 CPU 和设备计算结果的 `shape`、`dtype` 等信息。特别的,需要注意以下几个问题:
1. `dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml` 中配置了 `autograd:True` 的算子 (`cross_entropy_loss`、`conv2d`、`dropout`、`dropout_`、`linear`) 暂不支持 *backward* 的精度自动对比。如模型精度对不齐,可根据需要先将这几个算子 fallback 到 CPU 来确定问题。
2. 随机数生成相关的算子(`dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml` 中配置了 `autocompare:False`)没有做 `autocompare`,因为结果总是 `not_allclose`。
@@ -245,12 +245,11 @@ autocompare: add.out other: allclose
>>> import os
diopi dyload init
>>> x = torch.randn(3,4).cuda()
->>> os.environ['DIPU_DUMP_OP_ARGS']='1' # 只打印调用的底层算子名以及相关的diopi函数
+>>> os.environ['DIPU_DUMP_OP_ARGS']='1' # 只打印调用的底层算子名以及相关的 diopi 函数
>>> y = x + x
[dipu_add_out:349]:add.out diopiAdd
-
->>> os.environ['DIPU_DUMP_OP_ARGS']='2' # 打印调用的底层算子名,相关的diopi函数,算子参数
+>>> os.environ['DIPU_DUMP_OP_ARGS']='2' # 打印调用的底层算子名,相关的 diopi 函数,算子参数
>>> y = x + 3
[dipu_add_out:349]:add.out diopiAdd
[dipu_add_scalar_out:248]:add.Scalar_out diopiAddScalar
@@ -259,8 +258,7 @@ diopi dyload init
add.Scalar_out: alpha:1
add.Scalar_out: out:numel:12, sizes:[3, 4], stride:[4, 1], is_view:0, TensorOptions(dtype=float, device=privateuseone:0, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)), data_ptr:0x7ff8c8c00400
-
->>> os.environ['DIPU_DUMP_OP_ARGS']='3' # 打印调用的底层算子名,相关的diopi函数,算子参数, tensor的值
+>>> os.environ['DIPU_DUMP_OP_ARGS']='3' # 打印调用的底层算子名,相关的 diopi 函数,算子参数, tensor 的值
>>> y = x * 3
[dipu_mul_out:815]:mul.out diopiMul
[dipu_mul_scalar_out:753]:mul.Scalar_out diopiMulScalar
@@ -289,7 +287,7 @@ diopi dyload init
### 核心代码添加
-- 在 `dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h` 中定义了DIPU支持的硬件类型,我们需要在 `VendorDeviceType` 枚举类中添加 `DROPLET` 的硬件后端,并在这个文件中的`VendorTypeToStr` 函数里添加新硬件支持。后续这个文件中可能有更多的函数会涉及到硬件类型,按需添加即可。
+- 在 `dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h` 中定义了 DIPU 支持的硬件类型,我们需要在 `VendorDeviceType` 枚举类中添加 `DROPLET` 的硬件后端,并在这个文件中的`VendorTypeToStr` 函数里添加新硬件支持。后续这个文件中可能有更多的函数会涉及到硬件类型,按需添加即可。
- `dipu/torch_dipu/csrc_dipu/vendor` 文件夹中存有各个硬件后端的 *runtime* 接入代码,我们需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/deviceapis.h` 中的声明,创建 `deviceimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。下面是 `deviceapis.h` 中的 `createStream` 函数的在国产硬件上的实现样例:
``` cpp
@@ -302,7 +300,7 @@ void createStream(deviceStream_t* stream, bool prior) {
}
```
-- 如果有多机多卡训练的需求,需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h` 中的声明,创建 `communiatorimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。
+- 如果有多机多卡训练的需求,需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h` 中的声明,创建 `communicatorimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。
- DIPU 在 `dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h` 中声明了 `DIPUGeneratorImpl` 这一个基本类型,如果我们的硬件实现了自己的 `generator` 基础函数,可以在这基础上实现自己的 `DeviceGeneratorImpl`,并实现基础的 `generator` 相关函数。国产硬件暂无这方面的实现。
### 增加编译脚本
@@ -326,4 +324,4 @@ void createStream(deviceStream_t* stream, bool prior) {
- 根据 DIPU 的编译介绍,我们在编译了 DIPU 之后,需要注意将 `LIBRARY_PATH`、`LD_LIBRARY_PATH`、`PYTHONPATH` 都设置好避免后续使用出现问题。
- `dipu/tests` 文件夹中有许多基础功能的测试,建议首先尝试测试 `python -u dipu/tests/python/unittests/test_add.py`,该文件测试跑通基本意味着我们的设备 *runtime* 接入没有问题了。
-- 编译脚本参考[编译 DIPU](#编译-dipu),测试脚本可以参考[验证 DIPU](#验证-dipu)。
+- 编译脚本参考 [编译 DIPU](#编译-dipu),测试脚本可以参考 [验证 DIPU](#验证-dipu)。
diff --git a/dipu/README.md b/dipu/README.md
index 3b55bac80d..ce128bcf4c 100644
--- a/dipu/README.md
+++ b/dipu/README.md
@@ -8,7 +8,7 @@
## 介绍
-DIPU (device independent process unit) 是由 **一组抽象设备 Runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层** 共同组成的拓展包。 用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
+DIPU (device independent process unit) 是由 **一组抽象设备 Runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层** 共同组成的拓展包。用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被映射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 既可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
虽然 PyTorch 定义了一套基础的运行时接口 `c10`,可以基于这个接口直接抽象各个设备接口,但是 `c10` 首先是个直面框架层的接口,每个接入的设备都需要实现大量类似的逻辑来完成 `c10` 的实现,对于多设备的支持很不方便。DIPU 先把 `c10` 的运行时适配到 DIPU 自己的运行时,把通用的逻辑抽取出来,可以让厂商仅实现必要的设备接口即可工作。
@@ -25,7 +25,7 @@ DIPU 结构上分为 Python 和 CPP 两部分:
Runtime 主要有以下几个部分:
1. *Core & Distributed*
- - PyTorch 把一些基本的设备层接口放到了一个叫 `c10` 的目录下,不同的设备接入者需要实现该接口来接入 PyTorch。详见[参考文档](http://blog.ezyang.com/2019/05/pytorch-internals/)对于`c10` 的介绍。
+ - PyTorch 把一些基本的设备层接口放到了一个叫 `c10` 的目录下,不同的设备接入者需要实现该接口来接入 PyTorch。详见 [参考文档](http://blog.ezyang.com/2019/05/pytorch-internals/) 对于`c10` 的介绍。
- DIPU 的这一部分主要就是对 PyTorch 的 `c10` 和 `c10d` 相关接口的实现,把设备无关的部分抽象出一组运行时基类。目前包含 `DIPUAllocator`、`DIPUGenerator`、`DIPUStream/Event/Guard`、`ProcessGroupDICL` 等。这些类会把设备相关的请求代理到 *device* 部分定义的一组设备接口。另外用户也可以继承上述基类,实现并注册自己的子类,实现设备特化的某些行为(这个能力的支持目前尚待完善)。
2. *Device*
- 包含 `deviceapis.h` 和 `diclapis.h` 两个接口文件。主要是设备 `memory/stream/event/communcation` 相关的接口函数(这部分接口后续有考虑挪到 DIOPI 中,成为 DIOPI 的 *Device* 接口,见上图)。
@@ -40,7 +40,7 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能
#### DiopiRT (`csrc/dipu/diopirt`)
-用于实现 DIOPI 要求的 *Runtime*,具体参考 [DIOPI项目](https://github.com/DeepLink-org/DIOPI)。
+用于实现 DIOPI 要求的 *Runtime*,具体参考 [DIOPI 项目](https://github.com/DeepLink-org/DIOPI)。
#### Binding to Python (`csrc/dipu/binding`)
@@ -52,10 +52,10 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能
一般的,除了要实现上面 *Device* 部分要求的接口函数外,*Vendor* 还需要实现一个特殊的 `vendorapi.h`,在这里导出设备 `device/stream/event/comm` 相关的数据结构定义。未来计划在设备层允许 *Vendor* 注册特化的 *Runtime* 子类,或者实现子类的构建器/工厂方法接口,实现设备特化的 *Runtime* 行为。
-### Python层
+### Python 层
1. DIPU 设备层接口 (`torch_dipu/dipu`):
- - 包含CPP层的 *Runtime* 接口对应的 Python 层。这部分会导出部分函数给用户侧,导出的函数类比 PyTorch 的 `torch/cuda` 部分。
+ - 包含 CPP 层的 *Runtime* 接口对应的 Python 层。这部分会导出部分函数给用户侧,导出的函数类比 PyTorch 的 `torch/cuda` 部分。
2. DIPU 采用 `monkey-patch` 的方式模拟了部分 PyTorch tensor 接口,让它们可以处理 DIPU 特殊的参数,该部分的设计还在优化中。
3. DIPU 拥有一定的模拟 CUDA 接口的能力。简单来说就是在 Python 层 用前面 DIPU 设备层的接口来替换 `torch.cuda` 的同名接口。
@@ -65,17 +65,17 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能
### Dispatch 机制与 DIOPI 算子库
-PyTorch 的算子注册和分派有很多步骤,详见[参考文档](https://github.com/pytorch/pytorch/wiki/PyTorch-dispatcher-walkthrough)。
+PyTorch 的算子注册和分派有很多步骤,详见 [参考文档](https://github.com/pytorch/pytorch/wiki/PyTorch-dispatcher-walkthrough)。
-DIPU CPP 层适配的 ATen 算子对应的是分派过程中最底层(*backend* 层) 的算子或者 *composite* 层里等效为 *backend* 的算子。
+DIPU CPP 层适配的 ATen 算子对应的是分派过程中最底层(*backend* 层)的算子或者 *composite* 层里等效为 *backend* 的算子。
-这里面有一定的灵活性,以`Linear` 算子为例,在 PyTorch 的 `cpu/cuda` 设备上,它被实现为一个 `composite` 算子,实际的 *backend* 层算子是组合算子内部调用的 `addmm` 或者更底层的 `mm`。 而在 DIPU (`privateuse1`) 设备中,目前是注册了一个 `Linear` 算子(DIOPI 有这个算子)来替代组合算子,所以分派会直接走到新的 *backend* 层算子 `Linear`,而不会在调用原来的 `addmm/mm`。但是如果对应设备的 DIOPI 的 IMPL 算子库 没有实现 `diopiLinear` 而是实现了 `mm` 算子,也是可以正常走通 `Linear` 的调用流程的。
+这里面有一定的灵活性,以`Linear` 算子为例,在 PyTorch 的 `cpu/cuda` 设备上,它被实现为一个 `composite` 算子,实际的 *backend* 层算子是组合算子内部调用的 `addmm` 或者更底层的 `mm`。而在 DIPU (`privateuse1`) 设备中,目前是注册了一个 `Linear` 算子(DIOPI 有这个算子)来替代组合算子,所以分派会直接走到新的 *backend* 层算子 `Linear`,而不会在调用原来的 `addmm/mm`。但是如果对应设备的 DIOPI 的 IMPL 算子库 没有实现 `diopiLinear` 而是实现了 `mm` 算子,也是可以正常走通 `Linear` 的调用流程的。
### 无侵入式的 PyTorch 扩展包
-DIPU 没有直接修改 PyTorch 的代码,而是使用 out-of-tree 的方式接入新设备,详见[参考文档](https://pytorch.org/tutorials/advanced/extend_dispatcher.html)。
+DIPU 没有直接修改 PyTorch 的代码,而是使用 out-of-tree 的方式接入新设备,详见 [参考文档](https://pytorch.org/tutorials/advanced/extend_dispatcher.html)。
-PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,DIPU目前没有和 PyTorch 做官方的沟通,因此 PyTorch 主干里没有 `DIPU` 这个设备,目前是暂时借用 `PrivateUse1` 这个 Key(后续考虑改为借用 `XPU` 设备 Key,因为这个 Key 在 PyTorch 主干代码中有更好的支持)。
+PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,DIPU 目前没有和 PyTorch 做官方的沟通,因此 PyTorch 主干里没有 `DIPU` 这个设备,目前是暂时借用 `PrivateUse1` 这个 Key(后续考虑改为借用 `XPU` 设备 Key,因为这个 Key 在 PyTorch 主干代码中有更好的支持)。
基于用户私有的 *Backend Key* 和 `Dispatch Key`,PyTorch 会把算子调用请求分发到对应设备的算子实现。另外 `c10` 本身提供了一些注册能力,比如 `C10_REGISTER_GUARD_IMPL`,可以让用户把私有设备的 *Runtime* 代码注册到框架中。
@@ -83,7 +83,7 @@ PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,
### 算子适配能力
-为了更好的接入 DIOPI 算子,DIPU 提供了一组算子适配相关的辅助能力,比如灵活的算子 Fallback to CPU 的能力、算子精度自动对比的能力(对比 DIOPI 算子和 PyTorch 原生的 CPU 算子),算子执行过程中打印算子参数的能力。基于这些能力,接入算子时可以更方便排查算子精度等问题。 相关能力的具体说明参见 [Quick Start 文档](https://deeplink.readthedocs.io/zh-cn/latest/doc/DIPU/quick_start.html)的“算子库接入”章节。
+为了更好的接入 DIOPI 算子,DIPU 提供了一组算子适配相关的辅助能力,比如灵活的算子 Fallback to CPU 的能力、算子精度自动对比的能力(对比 DIOPI 算子和 PyTorch 原生的 CPU 算子),算子执行过程中打印算子参数的能力。基于这些能力,接入算子时可以更方便排查算子精度等问题。相关能力的具体说明参见 [Quick Start 文档](https://deeplink.readthedocs.io/zh-cn/latest/doc/DIPU/quick_start.html) 的“算子库接入”章节。
## 质量保障体系
@@ -94,7 +94,7 @@ PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,
2. 简单开发的手工测例。这部分测例更注重算子能否跑通,对算子要求较低。
3. 模型测试。我们开发了 `one_iter` 精度对比工具,会先在精度正确性没问题的设备(如 CPU 和 CUDA)上训练模型,保存每一层的算子输入、输出、权重、梯度数据,再在待测试设备上训练模型,逐层对比训练精度。
-> 更多信息请参考 [dipu/tests](./dipu/tests) 目录。
+> 更多信息请参考 [dipu/tests](./tests) 目录。
## Learn More
diff --git a/dipu/torch_dipu/profiler/readme.md b/dipu/torch_dipu/profiler/README.md
similarity index 75%
rename from dipu/torch_dipu/profiler/readme.md
rename to dipu/torch_dipu/profiler/README.md
index 6a91f325c6..46f1e2fdb5 100644
--- a/dipu/torch_dipu/profiler/readme.md
+++ b/dipu/torch_dipu/profiler/README.md
@@ -1,10 +1,14 @@
# Profiler
## 简介
-DeepLink Profiler是一个允许在训练和推理过程中收集性能指标的工具。Profiler的上下文管理器API可用于了解哪些模型算子最耗时,并检查其输入形状和堆栈跟踪,研究设备kernel活动并可视化执行跟踪。当使用DeepLink进行模型训练时,可以使用DeepLink Profiler定位性能瓶颈,指导性能优化。
+
+DeepLink Profiler 是一个允许在训练和推理过程中收集性能指标的工具。Profiler 的上下文管理器 API 可用于了解哪些模型算子最耗时,并检查其输入形状和堆栈跟踪,研究设备 kernel 活动并可视化执行跟踪。当使用 DeepLink 进行模型训练时,可以使用 DeepLink Profiler 定位性能瓶颈,指导性能优化。
+
## 使用说明
-本教程将以resnet18模型为例,讲解如何使用DeepLink Profiler分析模型性能。
-1. 导入必要的库
+
+本教程将以 resnet18 模型为例,讲解如何使用 DeepLink Profiler 分析模型性能。
+
+### 1. 导入必要的库
``` python
import torch_dipu
@@ -13,22 +17,23 @@ import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
```
-2. 实例化resnet18模型
+### 2. 实例化 resnet18 模型
```python
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)
```
-3. 使用DeepLink profiler分析模型执行时间
+### 3. 使用 DeepLink Profiler 分析模型执行时间
+
+DeepLink Profiler 接口对齐了 PyTorch Profiler,通过上下文管理器启用,并接受很多参数,常用的参数有
-DeepLink profiler接口对齐了PyTorch Profiler,通过上下文管理器启用,并接受很多参数,常用的参数有
+ `activities`:要收集的打点列表
- * `ProfilerActivity.CPU`:收集PyTorch算子、TorchScript函数以及用户自定义代码标签
- * `ProfilerActivity.CUDA`:收集设备kernel打点
+ + `ProfilerActivity.CPU`:收集 PyTorch 算子、TorchScript 函数以及用户自定义代码标签
+ + `ProfilerActivity.CUDA`:收集设备 kernel 打点
+ `record_shapes`:是否记录算子输入的形状
+ `profile_memory`:是否统计模型张量内存消耗
-+ `use_cuda`:是否统计设备kernel执行时间
++ `use_cuda`:是否统计设备 kernel 执行时间
+ `with_stack`:是否打印调用栈
```Python
@@ -36,14 +41,16 @@ with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
with record_function("model_inference"):
model(inputs)
```
+
打印出上面执行的统计数据:
+
```Python
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```
-输出如下
+输出如下:
-```
+```text
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
@@ -60,16 +67,20 @@ print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 253.751ms
```
-从输出中可以发现,大部分的执行时间花在conv2d。
-需要说明的是,cpu time是指这个算子执行的总时间;同时,该算子有可能调用其他算子,self cpu time是该算子的总时间减去调用其他算子的时间。
+从输出中可以发现,大部分的执行时间花在 conv2d。
+
+需要说明的是,cpu time 是指这个算子执行的总时间;同时,该算子有可能调用其他算子,self cpu time 是该算子的总时间减去调用其他算子的时间。
+
+要获得更精细的结果粒度并包括运算符输入形状,需要设置 `group_by_input_shape=True`(注意:这需要将 profiler 的输入参数 `record_shape` 设置为 `True`)
-要获得更精细的结果粒度并包括运算符输入形状,需要设置`group_by_input_shape=True`(注意:这需要将profile的输入参数`record_shape`设置为True)
```Python
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
```
-输出如下
-```
+
+输出如下:
+
+```text
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ --------------------------------------------------------------------------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ --------------------------------------------------------------------------------
@@ -85,9 +96,11 @@ print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total
aten::thnn_conv2d 0.01% 15.000us 14.36% 34.465ms 34.465ms 1 [[5, 3, 224, 224], [64, 3, 7, 7], [], [], [], []]
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ --------------------------------------------------------------------------------
```
-从输出可以看到,resnet18模型中卷积包含了几种不同的形状。
-Profiler还可用于分析在GPU和其他AI加速芯片上执行的模型的性能:
+从输出可以看到,resnet18 模型中卷积包含了几种不同的形状。
+
+Profiler 还可用于分析在 GPU 和其他 AI 加速芯片上执行的模型的性能:
+
```Python
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()
@@ -100,9 +113,9 @@ with profile(activities=[
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
```
-输出如下
+输出如下:
-```
+```text
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
@@ -130,11 +143,13 @@ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
Self CPU time total: 143.583ms
Self CUDA time total: 168.781ms
```
-从输出可以看到,`diopiConvolution2d`和`diopiBatchNorm`是两个算子耗时最长。
-4. 分析内存消耗
+从输出可以看到,`diopiConvolution2d` 和 `diopiBatchNorm` 是两个算子耗时最长。
+
+### 4. 分析内存消耗
+
+PyTorch Profiler 还可以统计算子分配或释放的内存量。要启用内存分析功能,请将 `profile_memory` 设置成 `True`。
-PyTorch profiler还可以统计算子分配或释放的内存量。要启用内存分析功能,请将profile_memory设置成True。
```Python
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)
@@ -143,8 +158,10 @@ with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shap
print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
```
-输出如下
-```
+
+输出如下:
+
+```text
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg CPU Mem Self CPU Mem # of Calls
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
@@ -162,28 +179,27 @@ print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
Self CPU time total: 119.442ms
```
+### 5. 使用 Chrome trace viewer 进行可视化
-5. 使用chrome trace viewer进行可视化
+Profiling 结果可以输出成 json 文件
-Profiling结果可以输出成json文件
```Python
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
model(inputs)
-
+
prof.export_chrome_trace("trace.json")
```
-使用Chrome trace viewer (chrome://tracing)工具查看trace.json文件,可视化结果如下图
+使用 Chrome trace viewer (`chrome://tracing`) 工具查看 `trace.json` 文件,可视化结果如下图:
+
+
-
-

-
+### 6. 打印调用链
-6. 打印调用链
+Profiler 可用于分析 Python 和 TorchScript 堆栈跟踪。
-Profiler可用于分析Python和TorchScript堆栈跟踪。
```Python
with profile(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
@@ -194,8 +210,10 @@ with profile(
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2))
```
-输出如下
-```
+
+输出如下:
+
+```text
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -----------------------------------------------------------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Source Location
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -----------------------------------------------------------------
@@ -215,13 +233,16 @@ print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total"
Self CPU time total: 139.666ms
Self CUDA time total: 169.640ms
```
-7. 使用Profiler分析长时间运行任务
-Profiler提供了一个额外的API来处理长时间运行的作业(如模型训练)。跟踪所有的执行可能很慢,并导致非常大的跟踪文件。要避免这种情况,请使用可选参数:
- 1. `schedule`:指定一个函数,该函数以整数参数作为输入,并返回一个动作给Profiler。使用这个参数的最佳方式是使用`torch.profiler.schedule`辅助函数,它可以为您生成一个schedule
- 2. `on_trace_ready`:指定一个函数,该函数将Profiler的引用作为输入,并在每次准备好新跟踪时由Profiler调用。
+### 7. 使用 Profiler 分析长时间运行任务
+
+Profiler 提供了一个额外的 API 来处理长时间运行的作业(如模型训练)。跟踪所有的执行可能很慢,并导致非常大的跟踪文件。要避免这种情况,请使用可选参数:
+
+1. `schedule`:指定一个函数,该函数以整数参数作为输入,并返回一个动作给 Profiler。使用这个参数的最佳方式是使用 `torch.profiler.schedule` 辅助函数,它可以为您生成一个 schedule。
+2. `on_trace_ready`:指定一个函数,该函数将 Profiler 的引用作为输入,并在每次准备好新跟踪时由 Profiler 调用。
+
+为了说明 API 是如何工作的,让我们首先考虑以下带有 `torch.profiler.schedule` 函数的示例:
-为了说明API是如何工作的,让我们首先考虑以下带有`torch.profiler.schedule`函数的示例:
```Python
from torch.profiler import schedule
@@ -232,22 +253,25 @@ my_schedule = schedule(
active=3,
repeat=2)
```
-Profiler假设长时间运行的任务由多个步骤组成,步骤编号从零开始。上面的示例定义了分析器的以下操作序列:
-1. 参数`skip_first`告诉分析器在前10个步骤中忽略追踪(`skip_first`的默认值为零);
-2. 在前`skip_first`个步骤之后,分析器开始执行分析器周期;
+
+Profiler 假设长时间运行的任务由多个步骤组成,步骤编号从零开始。上面的示例定义了分析器的以下操作序列:
+
+1. 参数 `skip_first` 告诉分析器在前 10 个步骤中忽略追踪(`skip_first` 的默认值为零);
+2. 在前 `skip_first` 个步骤之后,分析器开始执行分析器周期;
3. 每个周期包括三个阶段:
- 1. 空闲阶段(`wait=5`步骤),在此阶段分析器处于非活动状态;
- 2. 预热阶段(`warmup=1`步骤),在此阶段分析器开始追踪,但结果会被丢弃。此阶段用于丢弃追踪开始时分析器获取的样本,因为它们通常会被额外的开销所影响;
- 3. 活动追踪阶段(`active=3`步骤),在此阶段分析器进行追踪和记录数据;
-4. 可选的repeat参数指定循环的上限。默认情况下(零值),分析器将在任务运行时执行循环。
+ 1. 空闲阶段(`wait=5` 步骤),在此阶段分析器处于非活动状态;
+ 2. 预热阶段(`warmup=1` 步骤),在此阶段分析器开始追踪,但结果会被丢弃。此阶段用于丢弃追踪开始时分析器获取的样本,因为它们通常会被额外的开销所影响;
+ 3. 活动追踪阶段(`active=3` 步骤),在此阶段分析器进行追踪和记录数据;
+4. 可选的 repeat 参数指定循环的上限。默认情况下(零值),分析器将在任务运行时执行循环。
-因此,在上面的示例中,分析器将跳过前15个步骤,将下一个步骤用于预热,积极记录接下来的3个步骤,再跳过另外5个步骤,将下一个步骤用于预热,再积极记录另外3个步骤。由于指定了repeat=2参数值,分析器将在第一个两个周期后停止记录。
+因此,在上面的示例中,分析器将跳过前 15 个步骤,将下一个步骤用于预热,积极记录接下来的 3 个步骤,再跳过另外 5 个步骤,将下一个步骤用于预热,再积极记录另外 3 个步骤。由于指定了 `repeat=2` 参数值,分析器将在第一个两个周期后停止记录。
-在每个周期结束时,分析器调用指定的on_trace_ready函数,并将自身作为参数传递。该函数用于处理新的追踪结果,可以通过获取表格输出或将输出保存为追踪文件来进行处理。
+在每个周期结束时,分析器调用指定的 `on_trace_ready` 函数,并将自身作为参数传递。该函数用于处理新的追踪结果,可以通过获取表格输出或将输出保存为追踪文件来进行处理。
-要向分析器发送下一个步骤已开始的信号,请调用prof.step()函数。当前分析器步骤存储在prof.step_num中。
+要向分析器发送下一个步骤已开始的信号,请调用 `prof.step()` 函数。当前分析器步骤存储在 `prof.step_num` 中。
以下示例显示了如何使用上述概念:
+
```Python
def trace_handler(p):
output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
@@ -266,27 +290,32 @@ with profile(
model(inputs)
p.step()
```
+
## 使用案例
-### 案例一 Mobilenet v2多卡训练性能分析与优化
-##### 1. 问题描述:
+### 案例一 Mobilenet v2 多卡训练性能分析与优化
- 开发人员使用某个版本的DeepLink完成Mobilenet v2的适配后,发现该模型在NV上单机八卡训练很慢,需要进行性能优化,提升训练性能。
+#### 问题描述
-##### 2. 使用DeepLink Profer进行性能分析
- 1. 修改`mmpretrain`的`tools/train.py`,在`runner.train()`之前开启Profiler,将收集到的性能分析数据存入`mobilenetv2_profiler-slow`
-```Python
-from mmengine.hooks import ProfilerHook
+开发人员使用某个版本的 DeepLink 完成 Mobilenet v2 的适配后,发现该模型在 NV 上单机八卡训练很慢,需要进行性能优化,提升训练性能。
-profiler_hook = ProfilerHook(by_epoch = False, profile_times=10, activity_with_cpu=True, activity_with_cuda=True, json_trace_path='mobilenetv2_profiler-slow')
-runner.register_custom_hooks([profiler_hook])
-```
- 2. 使用chrome trace viewer查看,发现conv2d耗时长,从图中可以看到conv2d调用到了`thnn_conv2d`,而不是预期的`cudnn_convolution`
-
-

-
+#### 使用 DeepLink Profiler 进行性能分析
+
+1. 修改 `mmpretrain` 的 `tools/train.py`,在 `runner.train()` 之前开启 Profiler,将收集到的性能分析数据存入 `mobilenetv2_profiler-slow`:
+
+ ```Python
+ from mmengine.hooks import ProfilerHook
+
+ profiler_hook = ProfilerHook(by_epoch = False, profile_times=10, activity_with_cpu=True, activity_with_cuda=True, json_trace_path='mobilenetv2_profiler-slow')
+ runner.register_custom_hooks([profiler_hook])
+ ```
+
+2. 使用 chrome trace viewer 查看,发现 conv2d 耗时长,从图中可以看到 conv2d 调用到了`thnn_conv2d`,而不是预期的`cudnn_convolution`。
+
+ 
+
+3. 最后定位到 DeepLink 某个版本新增了 `torch._C._set_cudnn_enabled(false)`,关闭了 cudnn,把这句话删除速度恢复正常。
- 3. 最后定位到DeepLink某个版本新增了 `torch._C._set_cudnn_enabled(false)`,关闭了cudnn,把这句话删除速度恢复正常。
+## 参考资料
-## 参考
-1. [PyTorch profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
++ [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
From 2770001c28652a803ce7d78870f645d46c2f73cd Mon Sep 17 00:00:00 2001
From: liwenjian-sensetime
<109193776+liwenjian-sensetime@users.noreply.github.com>
Date: Thu, 7 Dec 2023 11:06:22 +0800
Subject: [PATCH 03/58] update env python 3.10 (#503)
---
.github/workflows/main.yml | 2 +-
dipu/scripts/ci/camb/ci_camb_env.sh | 10 +++++-----
dipu/scripts/ci/nv/ci_nv_env.sh | 12 ++++++------
3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6f946ae407..9b3f4cff4e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -128,7 +128,7 @@ jobs:
cd ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Camb
rm -rf scripts
ln -s ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main/dipu/third_party/DIOPI/scripts scripts
- source /mnt/cache/share/platform/env/camb_ci_diopi_impl
+ source /mnt/cache/share/platform/env/pt2.0_diopi
bash scripts/increment_coverage.sh ${REQUIRE_COVERAGE}
"""
diff --git a/dipu/scripts/ci/camb/ci_camb_env.sh b/dipu/scripts/ci/camb/ci_camb_env.sh
index 7527809648..858b9d4654 100644
--- a/dipu/scripts/ci/camb/ci_camb_env.sh
+++ b/dipu/scripts/ci/camb/ci_camb_env.sh
@@ -1,9 +1,9 @@
PLATFORM=/mnt/lustre/share/platform
-ENV_NAME=dipu_poc
+ENV_NAME=pt2.0_diopi
export PATH=`python ${PLATFORM}/env/clear_path.py PATH`
export LD_LIBRARY_PATH=`python ${PLATFORM}/env/clear_path.py LD_LIBRARY_PATH`
-GCC_ROOT=/mnt/lustre/share/platform/dep/gcc-7.5
-CONDA_ROOT=${PLATFORM}/env/miniconda3.8
+GCC_ROOT=/mnt/lustre/share/platform/dep/gcc-10.2
+CONDA_ROOT=${PLATFORM}/env/miniconda3.10
export NEUWARE_HOME=/usr/local/neuware
export CC=${GCC_ROOT}/bin/gcc
@@ -13,8 +13,8 @@ export CXX=${GCC_ROOT}/bin/g++
export DIOPI_ROOT=$(pwd)/third_party/DIOPI/impl/lib/
export DIPU_ROOT=$(pwd)/torch_dipu
export LD_LIBRARY_PATH=$DIPU_ROOT:$LD_LIBRARY_PATH
-export PYTHONPATH=${PYTORCH_DIR}/install_path/lib/python3.8/site-packages:${PYTHONPATH}
-export PATH=${GCC_ROOT}/bin:${PYTORCH_DIR}/install_path/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PATH}
+export PYTHONPATH=${PLATFORM}/dep/DIOPI_pytorch/pytorch2.0:${PYTHONPATH}
+export PATH=${GCC_ROOT}/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PATH}
export LD_PRELOAD=${GCC_ROOT}/lib64/libstdc++.so.6
diff --git a/dipu/scripts/ci/nv/ci_nv_env.sh b/dipu/scripts/ci/nv/ci_nv_env.sh
index d885dc983e..2f26b9d899 100644
--- a/dipu/scripts/ci/nv/ci_nv_env.sh
+++ b/dipu/scripts/ci/nv/ci_nv_env.sh
@@ -2,14 +2,14 @@ PLATFORM=/mnt/cache/share/platform
ENV_NAME=pt2.0_diopi
export PATH=`python ${PLATFORM}/env/clear_path.py PATH`
export LD_LIBRARY_PATH=`python ${PLATFORM}/env/clear_path.py LD_LIBRARY_PATH`
-GCC_ROOT=${PLATFORM}/dep/gcc-7.5
-CONDA_ROOT=${PLATFORM}/env/miniconda3.8
+GCC_ROOT=${PLATFORM}/dep/gcc-10.2
+CONDA_ROOT=${PLATFORM}/env/miniconda3.10
export CC=${GCC_ROOT}/bin/gcc
export CXX=${GCC_ROOT}/bin/g++
-export CUDA_PATH=${PLATFORM}/dep/cuda11.7-cudnn8.5
-export MPI_ROOT=${PLATFORM}/dep/openmpi-4.0.5-cuda11.7
-export NCCL_ROOT=${PLATFORM}/dep/nccl-2.13.4-cuda11.7
+export CUDA_PATH=${PLATFORM}/dep/cuda11.8-cudnn8.9
+export MPI_ROOT=${PLATFORM}/dep/openmpi-4.0.5-cuda11.8
+export NCCL_ROOT=${PLATFORM}/dep/nccl-2.15.5-cuda11.8
export GTEST_ROOT=${PLATFORM}/dep/googletest-gcc5.4
@@ -24,7 +24,7 @@ export DIOPI_ROOT=$(pwd)/third_party/DIOPI/impl/lib/
export DIPU_ROOT=$(pwd)/torch_dipu
export DIOPI_PATH=$(pwd)/third_party/DIOPI/proto
export DIPU_PATH=${DIPU_ROOT}
-export PYTORCH_DIR=${PLATFORM}/env/miniconda3.8/envs/pt2.0_diopi/lib/python3.8/site-packages
+export PYTORCH_DIR=${PLATFORM}/dep/DIOPI_pytorch/pytorch2.0_cu118
export LD_LIBRARY_PATH=$DIPU_ROOT:$LD_LIBRARY_PATH
export PYTHONPATH=${PYTORCH_DIR}:${PYTHONPATH}
export PATH=${GCC_ROOT}/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PLATFORM}/dep/binutils-2.27/bin:${PATH}
From 30894d09b0eafc5717dbf89de99bf165ae25efd3 Mon Sep 17 00:00:00 2001
From: tangzhiyi11
Date: Thu, 7 Dec 2023 19:48:18 +0800
Subject: [PATCH 04/58] [dicp][ascend] get soc_version from aclrt (#505)
---
dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h | 6 +++---
dicp/dicp/vendor/AscendGraph/compile_job.py | 5 +++--
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
index 380670146f..69e06fec8a 100644
--- a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
+++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
@@ -12,6 +12,7 @@
#include
#include
+#include "acl/acl.h"
#include "all_ops.h"
#include "ascend_string.h"
#include "ge_api.h"
@@ -83,10 +84,9 @@ class AclgraphBuilder {
public:
explicit AclgraphBuilder() {
// 1. system init
- std::string kSocVersion = "Ascend910ProB";
+ auto kSocVersion = aclrtGetSocName();
std::map<AscendString, AscendString> global_options = {
- {AscendString(ge::ir_option::SOC_VERSION),
- AscendString(kSocVersion.c_str())},
+ {AscendString(ge::ir_option::SOC_VERSION), AscendString(kSocVersion)},
{AscendString(ge::ir_option::PRECISION_MODE), "allow_fp32_to_fp16"},
};
auto status = aclgrphBuildInitialize(global_options);
diff --git a/dicp/dicp/vendor/AscendGraph/compile_job.py b/dicp/dicp/vendor/AscendGraph/compile_job.py
index 6b3b2b8228..625dc3dfb3 100644
--- a/dicp/dicp/vendor/AscendGraph/compile_job.py
+++ b/dicp/dicp/vendor/AscendGraph/compile_job.py
@@ -34,6 +34,7 @@ def __init__(self, source_code) -> None:
'-std=c++11',
'-O3',
'-Wall',
+ '-I/usr/local/Ascend/ascend-toolkit/latest/include',
'-I/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_proto/inc',
'-I/usr/local/Ascend/ascend-toolkit/latest/include/graph',
'-I/usr/local/Ascend/ascend-toolkit/latest/include/ge',
@@ -46,10 +47,10 @@ def __init__(self, source_code) -> None:
'-lge_runner',
source_path,
'-o' + self._lib_path,
- '-Wl,-rpath,/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub',
'/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub/libgraph.so',
'/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub/libge_runner.so',
- '/usr/local/Ascend/ascend-toolkit/latest/lib64/libgraph_base.so']
+ '/usr/local/Ascend/ascend-toolkit/latest/lib64/libgraph_base.so',
+ '/usr/local/Ascend/ascend-toolkit/latest/runtime/lib64/stub/libascendcl.so',]
def _compile(self):
if not os.path.exists(self._lib_path):
From 61f57ebc05df32c630097c174766a96760d2fa0c Mon Sep 17 00:00:00 2001
From: Chengyuan Li <37681002+cyLi-Tiger@users.noreply.github.com>
Date: Fri, 8 Dec 2023 17:01:42 +0800
Subject: [PATCH 05/58] lcy/clang-tidy (#483)
* fix namespace declaration format
* update diopi_functions.yaml
* update clang-tidy
* update clang-tidy
* change tab into spaces
* allow const_cast
* fix bug
* fix comment
* fix comments
* fix comments
---
.../autogen_diopi_wrapper.py | 20 +-
.../diopi_functions.yaml | 255 +++++++++++++-----
.../diopi_wrapper_template.py | 96 ++++---
.../csrc_dipu/aten/ops/DIPUCopy.hpp | 4 +-
4 files changed, 248 insertions(+), 127 deletions(-)
diff --git a/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py b/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py
index 5fc67a107d..366d5e6eda 100644
--- a/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py
+++ b/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py
@@ -118,7 +118,7 @@ def create_transform_input_to_cpu_code(fun_config):
for input in optional_tensor_list_inputs:
input_process_code += f"\nc10::List<c10::optional<at::Tensor>> {input}_cpu;\n"
input_process_code += f"for (int i = 0; i < {input}.size();++i)" + " {\n"
- input_process_code += f"\t{input}_cpu.push_back({input}[i].has_value() && {input}[i].value().defined() ? c10::make_optional({input}[i].value().cpu()) : {input}[i]);\n"
+ input_process_code += f" {input}_cpu.push_back({input}[i].has_value() && {input}[i].value().defined() ? c10::make_optional({input}[i].value().cpu()) : {input}[i]);\n"
input_process_code += "}\n"
outputs = re.findall('Tensor\([a-z]!\)[ ]+([\w\d_]+){1}', schema[:schema.find('->')])
@@ -151,7 +151,7 @@ def create_print_op_args_code(fun_config):
code += "if (dumpOpArgLevel() > 1) {\n"
for input in inputs:
input = input.strip()
- code += f'\tstd::cout << "\t{opname}:\t{input}:" << dumpArg({input}) << std::endl;\n'
+ code += f' std::cout << "\t{opname}:\t{input}:" << dumpArg({input}) << std::endl;\n'
code += "}"
return code
@@ -455,11 +455,11 @@ def create_result_compare_code(fun_config):
code = ''
if len(return_param) == 1 :
compare_code = f'_allclose(result_cpu, result_device)'
- code += f'std::cout << "autocompare:\t{op_name}\t{return_param[0]}:" << std::endl << "\t" << dumpArg(result_cpu) << std::endl << "\t" << dumpArg(result_device) << std::endl << "\t" << {compare_code} << std::endl;\n';
+ code += f'std::cout << "autocompare:\t{op_name}\t{return_param[0]}:" << std::endl << " " << dumpArg(result_cpu) << std::endl << " " << dumpArg(result_device) << std::endl << " " << {compare_code} << std::endl;\n';
elif len(return_param) > 1:
for i in range(len(return_param)):
compare_code = f'_allclose(std::get<{i}>(result_cpu), std::get<{i}>(result_device))'
- code += f'std::cout << "autocompare:\t{op_name}\t{return_param[i]}:" << std::endl << "\t" << dumpArg(std::get<{i}>(result_cpu)) << std::endl << "\t" << dumpArg(std::get<{i}>(result_device)) << std::endl << "\t" << {compare_code} << std::endl;\n';
+ code += f'std::cout << "autocompare:\t{op_name}\t{return_param[i]}:" << std::endl << " " << dumpArg(std::get<{i}>(result_cpu)) << std::endl << " " << dumpArg(std::get<{i}>(result_device)) << std::endl << " " << {compare_code} << std::endl;\n';
inputs = re.findall('Tensor +([\w\d_]+)', schema[:schema.find('->')])
inputs += re.findall('Tensor *\([a-z]!\) *\[ *\] +([\w\d_]+)', schema[:schema.find('->')])
@@ -474,8 +474,8 @@ def create_code_to_print_fun_call_info_from_schema(fun_config):
op_name = get_op_name_from_schema(fun_config['schema'])
diopi_func = fun_config.get('interface', '')
diopi_func = diopi_func[0 : diopi_func.find('(')]
- debug_code = "if (dumpOpArgLevel() > 0) {\n\t"
- debug_code += f'printf("--%-50s %-30s \\n", "[{op_name}]:", "{diopi_func}");' + '\n'
+ debug_code = "if (dumpOpArgLevel() > 0) {\n"
+ debug_code += f' printf("--%-50s %-30s \\n", "[{op_name}]:", "{diopi_func}");' + '\n'
debug_code += "}\n"
return debug_code
@@ -539,10 +539,10 @@ def create_device_check_code(fun_config):
for args in set(tensors):
if not args.endswith('?'):
- code += f'\tTORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
+ code += f' TORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
else:
args = args[0:-1]
- code += f'\tTORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
+ code += f' TORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
if len(tensors) > 0:
code += "}"
@@ -588,7 +588,9 @@ def functions_code_gen(fun_config):
if input.strip().endswith('?'):
input = input.replace('?', '')
input_process_code += f"\n::diopiConstTensorHandle_t {input}{diopi_tensor_suffix} = nullptr;\n"
- input_process_code += f"if ({input}.has_value() && {input}.value().defined()) {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input}.value());\n\n"
+ input_process_code += f"if ({input}.has_value() && {input}.value().defined())" + "{\n"
+ input_process_code += f" {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input}.value());\n"
+ input_process_code += "}\n"
else:
input_process_code += f"::diopiConstTensorHandle_t {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input});\n"
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
index 4b58185a24..46bebbd3f2 100755
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -36,15 +36,15 @@
- schema: "aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"
custom_code_at_the_beginning: |
- if (other.numel() == 1) {
- return dipu_add_scalar_out(self, other.cpu().item(), alpha, out);
- } else if (self.numel() == 1) {
+ if (other.numel() == 1 && other.is_cpu()) {
+ return dipu_add_scalar_out(self, other.item(), alpha, out);
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
if (alpha.toDouble() == 1.0) {
- return dipu_add_scalar_out(other, self.cpu().item(), alpha, out);
- } else {
- dipu_fill__scalar(out, self.cpu().item());
- return dipu_add__tensor(out, other, alpha);
+ return dipu_add_scalar_out(other, self.item(), alpha, out);
}
+ dipu_fill__scalar(out, self.item());
+ return dipu_add__tensor(out, other, alpha);
}
interface: diopiAdd(ctx, out, self, other, alpha)
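The early-return branch above targets the common "device tensor plus one-element CPU tensor" pattern; a minimal sketch of the call it serves, again using `"cuda"` as an illustrative stand-in for the DIPU device:

```python
import torch

a = torch.randn(3, 4, device="cuda")  # tensor on the accelerator
b = torch.tensor(2.0)                 # one-element tensor living on the CPU

# `other` is a one-element CPU tensor, so the wrapper falls through to the
# add.Scalar_out path instead of copying `b` to the device first.
assert torch.allclose(torch.add(a, b), a + 2.0)
```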
@@ -55,7 +55,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_sub_scalar_out(self, other.item(), alpha, out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
at::Tensor selfTensor = at::empty_like(other);
dipu_fill__scalar(selfTensor, self.item());
return dipu_sub_out(selfTensor, other, alpha, out);
@@ -94,7 +95,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_div_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_div_scalar_out(other, self.item(), out);
}
interface: diopiDiv(ctx, out, self, other, RoundModeNone)
@@ -108,7 +110,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_div_scalar_mode_out(self, other.item(), rounding_mode, out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_div_scalar_mode_out(other, self.item(), rounding_mode, out);
}
const auto mode = toDiopiRoundMode(rounding_mode.has_value() ? rounding_mode.value().data():"none");
@@ -135,7 +138,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_mul_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_mul_scalar_out(other, self.item(), out);
}
interface: diopiMul(ctx, out, self, other)
@@ -191,13 +195,19 @@
- schema: "aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))"
interface: diopiBatchNorm(ctx, out, save_mean, save_invstd, input, weight, bias, const_cast(running_mean), const_cast(running_var), training, momentum, eps);
+ custom_code_before_call_diopi: |
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
+ custom_code_before_return: |
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
- schema: "aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)"
custom_code_at_the_beginning: |
const int64_t dim_c = input.size(1);
auto out0 = at::empty_like(input);
auto options = input.options().dtype(at::kFloat);
- at::Tensor out1, out2;
+ at::Tensor out1;
+ at::Tensor out2;
if (!training) {
// do not require save_mean/save_invstd when in test mode
out1 = at::empty({0}, options);
@@ -207,6 +217,11 @@
out2 = at::empty({dim_c}, options);
}
interface: diopiBatchNorm(ctx, out0, out1, out2, input, weight, bias, const_cast(running_mean), const_cast(running_var), training, momentum, eps);
+ custom_code_before_call_diopi: |
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
+ custom_code_before_return: |
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
- schema: "native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)"
custom_code_at_the_beginning: |
@@ -235,7 +250,7 @@
- schema: "native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor out, Tensor save_mean, Tensor save_invstd)"
custom_code_at_the_beginning: |
const auto input_shape = input.sizes();
- const int axis = input_shape.size() - normalized_shape.size();
+ const int axis = static_cast<int>(input_shape.size()) - static_cast<int>(normalized_shape.size());
const int64_t M = c10::multiply_integers(input_shape.cbegin(), input_shape.cbegin() + axis);
std::vector<int64_t> stats_shape(input_shape.size(), 1);
std::copy(input_shape.begin(), input_shape.begin() + axis, stats_shape.begin());
@@ -290,7 +305,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_eq_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_eq_scalar_out(other, self.item(), out);
}
interface: diopiEq(ctx, out, self, other)
@@ -312,7 +328,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_lt_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_lt_scalar_out(other, self.item(), out);
}
interface: diopiLt(ctx, out, self, other)
@@ -334,7 +351,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_ne_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_ne_scalar_out(other, self.item(), out);
}
interface: diopiNe(ctx, out, self, other)
@@ -356,7 +374,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_ge_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_ge_scalar_out(other, self.item(), out);
}
interface: diopiGe(ctx, out, self, other)
@@ -378,7 +397,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_gt_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_gt_scalar_out(other, self.item(), out);
}
interface: diopiGt(ctx, out, self, other)
@@ -400,7 +420,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_le_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_le_scalar_out(other, self.item(), out);
}
interface: diopiLe(ctx, out, self, other)
@@ -551,7 +572,7 @@
auto out = at::empty(output_shape, input.options());
interface: diopiConvTranspose2d(ctx, out, input, weight, bias, stride, padding, output_padding, groups, dilation)
forward_process_code: |
- bool bias_has_value = (bias.has_value() == true) ? bias.value().requires_grad() : false;
+ bool bias_has_value = (bias.has_value()) ? bias.value().requires_grad() : false;
saved_data:
[
stride,
@@ -577,10 +598,7 @@
if (bias_has_value) {
bias_sizes.push_back(grad_output.size(1));
}
- std::array<bool, 3> output_mask;
- output_mask[0] = input.requires_grad();
- output_mask[1] = weight.requires_grad();
- output_mask[2] = bias_has_value;
+ std::array<bool, 3> output_mask = {input.requires_grad(), weight.requires_grad(), bias_has_value};
backward_schema: "convolution_transpose_backward(Tensor grad_output, Tensor input, Tensor weight, int[] bias_sizes, int[] stride, int[] padding, int[] dilation, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
backward_return_code: |
std::vector<at::Tensor> outputs = {
@@ -662,7 +680,9 @@
- schema: "topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)"
custom_code_at_the_beginning: |
std::vector<int64_t> output_size(self.sizes().begin(), self.sizes().end());
- dim = dim < 0 ? (dim + output_size.size()) : dim;
+ if (dim < 0) {
+ dim = dim + static_cast<int64_t>(output_size.size());
+ }
output_size[dim] = k;
auto values = at::empty(output_size, self.options());
auto indices = at::empty(output_size, self.options().dtype(at::kLong));
@@ -693,7 +713,9 @@
device: [all, -cuda]
custom_fallback: True
custom_code_at_the_beginning: |
- at::Tensor grad_input, grad_weight, grad_bias;
+ at::Tensor grad_input;
+ at::Tensor grad_weight;
+ at::Tensor grad_bias;
if (output_mask[0]) {
grad_input = at::empty(input.sizes(), grad_output.options());
}
@@ -850,15 +872,17 @@
- schema: "stack(Tensor[] tensors, int dim=0) -> Tensor"
custom_code_at_the_beginning: |
- dim += dim < 0 ? tensors[0].sizes().size()+1 : 0;
- auto num_tensors = tensors.size();
+ if (dim < 0) {
+ dim += static_cast<int64_t>(tensors[0].sizes().size()) + 1;
+ }
+ auto num_tensors = static_cast<int64_t>(tensors.size());
auto shape = tensors[0].sizes();
std::vector<int64_t> tmp;
for (int i = 0; i < dim; i++) {
tmp.push_back(shape[i]);
}
tmp.push_back(num_tensors);
- for (int i = dim; i < shape.size(); i++) {
+ for (int i = static_cast<int>(dim); i < shape.size(); i++) {
tmp.push_back(shape[i]);
}
const std::vector<int64_t>& const_tmp = tmp;
@@ -873,28 +897,45 @@
- schema: "stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)"
custom_code_at_the_beginning: |
- dim += dim < 0 ? tensors[0].sizes().size() : 0;
+ if (dim < 0) {
+ dim += static_cast<int64_t>(tensors[0].sizes().size());
+ }
std::vector<diopiConstTensorHandle_t> diopiTensorHandles(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
diopiTensorHandles[i] = dipu::diopi_helper::toDiopiTensorHandle(tensors.at(i));
}
- interface: diopiStack(ctx, out, diopiTensorHandles.data(), tensors.size(), dim)
+ interface: diopiStack(ctx, out, diopiTensorHandles.data(), static_cast<int64_t>(tensors.size()), dim)
- schema: "sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)"
custom_code_at_the_beginning: |
- auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim;
+ int64_t dim_ = 0;
+ if (dim < 0) {
+ dim_ = dim + static_cast<int64_t>(self.sizes().size());
+ } else {
+ dim_ = dim;
+ }
auto values = at::empty(self.sizes(), self.options());
auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr)
- schema: "sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"
custom_code_at_the_beginning: |
- auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim;
+ int64_t dim_ = 0;
+ if (dim < 0) {
+ dim_ = dim + static_cast<int64_t>(self.sizes().size());
+ } else {
+ dim_ = dim;
+ }
interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr)
- schema: "sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"
custom_code_at_the_beginning: |
- auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim;
+ int64_t dim_ = 0;
+ if (dim < 0) {
+ dim_ = dim + static_cast<int64_t>(self.sizes().size());
+ } else {
+ dim_ = dim;
+ }
bool stable_ = stable.has_value() ? stable.value() : false;
const bool *p = &stable_;
interface: diopiSort(ctx, values, indices, self, dim_, descending, p)
@@ -1047,7 +1088,7 @@
}
const auto& self_sizes = self.sizes();
- for (int i = self_sizes.size() - 1, j = output_size.size() - 1;i >= 0;i--, j--) {
+ for (int i = static_cast<int>(self_sizes.size()) - 1, j = static_cast<int>(output_size.size()) - 1;i >= 0;i--, j--) {
output_size[j] *= self_sizes.at(i);
}
@@ -1057,15 +1098,20 @@
- schema: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
custom_code_at_the_beginning: |
auto out = at::empty_like(self);
+ // NOLINTNEXTLINE(readability-suspicious-call-argument)
return dipu_sub_out(other, self, alpha, out);
interface: diopiSub(ctx, out, other, self, alpha)
- schema: "unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor out, Tensor indices, Tensor counts)"
custom_code_at_the_beginning: |
- at::Tensor out, counts, indices;
+ at::Tensor out;
+ at::Tensor counts;
+ at::Tensor indices;
if (return_inverse) {
const auto ndims = self.sizes().size();
- dim += (dim < 0 ? ndims : 0);
+ if (dim < 0) {
+ dim += static_cast<int64_t>(ndims);
+ }
indices = at::empty({self.sizes().at(dim)}, self.options().dtype(at::kLong));
}
diopiTensorHandle_t out_ptr = nullptr;
@@ -1080,7 +1126,9 @@
- schema: "_unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor out, Tensor indices, Tensor counts)"
custom_code_at_the_beginning: |
- at::Tensor out, counts, indices;
+ at::Tensor out;
+ at::Tensor counts;
+ at::Tensor indices;
if (return_inverse) {
indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
}
@@ -1100,7 +1148,7 @@
std::transform(tensors.begin(), tensors.end(), diopiTensorHandles.begin(), [](const at::Tensor& tensor){
return dipu::diopi_helper::toDiopiTensorHandle(tensor);
});
- interface: diopiCat(ctx, out, diopiTensorHandles.data(), tensors.size(), dim);
+ interface: diopiCat(ctx, out, diopiTensorHandles.data(), static_cast<int64_t>(tensors.size()), dim);
- schema: "masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor"
custom_code_at_the_beginning: |
@@ -1125,7 +1173,7 @@
- schema: "min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) min, Tensor(b!) min_indices)"
custom_code_at_the_beginning: |
- dim += ((dim >= 0) ? 0 : self.sizes().size());
+ dim += ((dim >= 0) ? 0 : static_cast<int64_t>(self.sizes().size()));
interface: diopiMin(ctx, min, min_indices, self, dim)
- schema: "max(Tensor self) -> Tensor"
@@ -1143,7 +1191,7 @@
- schema: "max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_indices) -> (Tensor(a!) max, Tensor(b!) max_indices)"
custom_code_at_the_beginning: |
- dim += ((dim >= 0) ? 0 : self.sizes().size());
+ dim += ((dim >= 0) ? 0 : static_cast<int64_t>(self.sizes().size()));
if (max_indices.numel() <= 0) {
auto output_size = self.sizes().vec();
if (keepdim) {
@@ -1265,11 +1313,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0));
- size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(self.size(-2)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(self.size(-1)) * scales_w.value_or(1.0));
}
interface: diopiUpsampleNearest(ctx, out, self, size);
@@ -1278,11 +1326,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0));
- size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(self.size(-2)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(self.size(-1)) * scales_w.value_or(1.0));
}
const char* mode = "bilinear";
interface: diopiUpsampleLinear(ctx, out, self, size, align_corners, mode);
@@ -1292,11 +1340,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor((*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
- size[1] = std::floor((*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
}
interface: diopiUpsampleNearestBackward(ctx, grad_input, grad_output, size, input_size)
@@ -1305,11 +1353,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor((*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
- size[1] = std::floor((*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
}
const char* mode = "bilinear";
interface: diopiUpsampleLinearBackward(ctx, grad_input, grad_output, size, input_size, align_corners, mode)
@@ -1418,7 +1466,7 @@
custom_code_at_the_beginning: |
auto shape = self.sizes();
std::vector<int64_t> output_shape(shape.begin(), shape.end());
- dim += dim >= 0 ? 0 : shape.size();
+ dim += dim >= 0 ? 0 : static_cast<int64_t>(shape.size());
output_shape[dim] = index.numel();
auto out = at::empty({output_shape}, self.options());
interface: diopiIndexSelect(ctx, out, self, dim, index)
@@ -1527,7 +1575,35 @@
at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
backward_return_code: |
- std::vector<at::Tensor> outputs(7);
+ /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h
+ *
+ * ''' custom_function.h
+ * auto num_outputs = static_cast<int>(outputs.size());
+ * // Returning too many results is ok, but only as long as they're all
+ * // undefined. Truncate the result vector in that case.
+ * if (num_outputs > num_forward_inputs) {
+ * bool all_undef = true;
+ * for (const auto i : c10::irange(num_forward_inputs, num_outputs)) {
+ * all_undef &= (!outputs[i].defined());
+ * }
+ * if (all_undef) {
+ * outputs.resize(num_forward_inputs);
+ * num_outputs = num_forward_inputs;
+ * }
+ * }
+ *
+ * if (num_outputs != num_forward_inputs) {
+ * std::string msg("function ");
+ * msg += name() + " returned an incorrect number of gradients (expected ";
+ * msg += c10::to_string(num_forward_inputs) + ", got ";
+ * msg += c10::to_string(num_outputs) + ")";
+ * throw std::runtime_error(msg);
+ * }
+ * '''
+ */
+
+ constexpr int kSameAsInputSize = 7;
+ std::vector<at::Tensor> outputs(kSameAsInputSize);
outputs[0] = result;
return outputs;
@@ -1610,7 +1686,35 @@
at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
backward_return_code: |
- std::vector<at::Tensor> outputs(7);
+ /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h
+ *
+ * ''' custom_function.h
+ * auto num_outputs = static_cast<int>(outputs.size());
+ * // Returning too many results is ok, but only as long as they're all
+ * // undefined. Truncate the result vector in that case.
+ * if (num_outputs > num_forward_inputs) {
+ * bool all_undef = true;
+ * for (const auto i : c10::irange(num_forward_inputs, num_outputs)) {
+ * all_undef &= (!outputs[i].defined());
+ * }
+ * if (all_undef) {
+ * outputs.resize(num_forward_inputs);
+ * num_outputs = num_forward_inputs;
+ * }
+ * }
+ *
+ * if (num_outputs != num_forward_inputs) {
+ * std::string msg("function ");
+ * msg += name() + " returned an incorrect number of gradients (expected ";
+ * msg += c10::to_string(num_forward_inputs) + ", got ";
+ * msg += c10::to_string(num_outputs) + ")";
+ * throw std::runtime_error(msg);
+ * }
+ * '''
+ */
+
+ constexpr int kSameAsInputSize = 7;
+ std::vector<at::Tensor> outputs(kSameAsInputSize);
outputs[0] = result;
return outputs;
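The comment block above restates why these CTC-loss wrappers return a fixed-size gradient vector: torch::autograd checks that backward() produces one entry per forward input, and any extra entries must be undefined. A minimal sketch of the resulting shape, assuming seven forward inputs as in these schemas (the helper name is illustrative, not part of the patch):

    // Sketch: return one real gradient followed by undefined placeholders,
    // so the gradient count matches the number of forward inputs.
    #include <vector>
    #include <ATen/ATen.h>

    std::vector<at::Tensor> make_backward_outputs(const at::Tensor& grad_log_probs) {
      constexpr int kNumForwardInputs = 7;                   // mirrors the forward parameter count
      std::vector<at::Tensor> outputs(kNumForwardInputs);    // default-constructed tensors are undefined
      outputs[0] = grad_log_probs;                           // only log_probs receives a gradient
      return outputs;
    }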
@@ -1755,7 +1859,7 @@
indices_tensor_vec[i] = (indices[i].has_value() && indices[i].value().defined()) ? indices[i].value().to(self.device()) : at::Tensor();
indices_vec[i] = diopi_helper::toDiopiTensorHandle(indices_tensor_vec[i]);
}
- interface: diopiIndex(ctx, &out_ptr, self, indices_vec.data(), indices_vec.size())
+ interface: diopiIndex(ctx, &out_ptr, self, indices_vec.data(), static_cast<int64_t>(indices_vec.size()))
custom_code_before_return: |
dipu::getCurrentDIPUStream().synchronize();
out = *reinterpret_cast<at::Tensor*>(out_ptr);
@@ -1769,7 +1873,7 @@
indices_tensor_vec[i] = (indices[i].has_value() && indices[i].value().defined()) ? indices[i].value().to(self.device()) : at::Tensor();
indices_vec[i] = diopi_helper::toDiopiTensorHandle(indices_tensor_vec[i]);
}
- interface: diopiIndexPut(ctx, self, self, values, indices_vec.data(), indices_vec.size(), accumulate)
+ interface: diopiIndexPut(ctx, self, self, values, indices_vec.data(), static_cast<int64_t>(indices_vec.size()), accumulate)
- schema: "_cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor"
custom_code_at_the_beginning: |
@@ -1832,15 +1936,15 @@
int num_blocks = 1;
for(int i = 0; i < 2; i++){
- num_blocks *= int((input_shape[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i]) + 1;
+ num_blocks *= static_cast<int>((input_shape[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i]) + 1;
}
- int channels = input_shape[1];
+ int channels = static_cast<int>(input_shape[1]);
for(int i = 0; i < 2; i++){
- channels *= kernel_size[i];
+ channels *= static_cast<int>(kernel_size[i]);
}
std::vector<int64_t> out_shape({channels, num_blocks});
- if(batched_input == true){
+ if(batched_input){
out_shape.insert(out_shape.begin(), input_shape[0]);
}
auto out = at::empty({out_shape}, self.options());
@@ -1856,13 +1960,13 @@
input_shape.insert(input_shape.begin(), 1);
}
- int channels = input_shape[1];
+ int channels = static_cast<int>(input_shape[1]);
for(int i = 0; i < 2; i++){
- channels = channels / kernel_size[i];
+ channels = channels / static_cast<int>(kernel_size[i]);
}
std::vector<int64_t> out_shape({channels, output_size.at(0).expect_int(), output_size.at(1).expect_int()});
- if(batched_input == true){
+ if(batched_input){
out_shape.insert(out_shape.begin(), input_shape[0]);
}
auto out = at::empty({out_shape}, self.options());
@@ -1907,7 +2011,12 @@
auto shape = input.size(1);
auto out0 = at::empty({shape}, input.options().dtype(at::kFloat));
auto out1 = at::empty({shape}, input.options().dtype(at::kFloat));
- interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast(running_mean), const_cast(running_var), momentum, eps, counts)
+ interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast(running_mean), const_cast(running_var), static_cast(momentum), static_cast(eps), counts)
+ custom_code_before_call_diopi: |
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
+ custom_code_before_return: |
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
- schema: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
custom_code_at_the_beginning: |
@@ -1936,7 +2045,7 @@
- schema: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor
custom_code_at_the_beginning: |
auto out = at::empty_like(input);
- interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, eps);
+ interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, static_cast<float>(eps));
- schema: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
interface: diopiSmoothL1Loss(ctx, out, self, target, static_cast(reduction), static_cast(beta));
@@ -2143,7 +2252,7 @@
auto selfVec = self.vec();
auto scalarsCpu = scalars.cpu();
for (size_t i = 0;i < self.size();i++) {
- dipu_addcmul_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[i].item());
+ dipu_addcmul_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[static_cast<int64_t>(i)].item());
}
return;
interface: diopiAddcmulInp(ctx, self, tensor1, tensor2, scalars)
@@ -2174,7 +2283,7 @@
auto selfVec = self.vec();
auto scalarsCpu = scalars.cpu();
for (size_t i = 0;i < self.size();i++) {
- dipu_addcdiv_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[i].item());
+ dipu_addcdiv_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[static_cast<int64_t>(i)].item());
}
return;
interface: diopiAddcdivInp(ctx, self, tensor1, tensor2, scalars)
@@ -2262,12 +2371,14 @@
custom_fallback: True
custom_code_at_the_beginning: |
std::vector<diopiTensorHandle_t> diopiTensorHandles(self.size(), nullptr);
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
std::transform(self.begin(), self.end(), diopiTensorHandles.begin(), [](const at::Tensor& t){
return dipu::diopi_helper::toDiopiTensorHandle(const_cast<at::Tensor&>(t));
});
- interface: diopiAmpForeachNonFiniteCheckAndUnscaleInp(ctx, diopiTensorHandles.data(), self.size(), found_inf, inv_scale)
- autocompare: disable
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
+ interface: diopiAmpForeachNonFiniteCheckAndUnscaleInp(ctx, diopiTensorHandles.data(), static_cast<int64_t>(self.size()), found_inf, inv_scale)
- schema: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
custom_fallback: True
- interface: diopiAmpUpdateScaleInp(ctx, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval)
+ interface: diopiAmpUpdateScaleInp(ctx, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, static_cast<int32_t>(growth_interval))
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py b/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py
index 7eda79b15c..22076410d1 100644
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py
@@ -16,29 +16,37 @@
#include
#include "CustomFallbackFunctions.hpp"
#include "csrc_dipu/aten/ops/DIPUCopy.hpp"
+#include
$header_include_code
-namespace dipu::native {
-
-using dipu::diopi_helper::toDiopiGeneratorHandle;
+// NOTE: some kernels (e.g. _foreach_add_.List) have custom code at the beginning that ends with an early return.
+// This is a workaround intended to skip some of the autogenerated code (e.g. type casts, the DIOPI call, etc.).
+// NOLINTBEGIN(readability-redundant-control-flow)
+namespace dipu {
-using namespace dipu::diopi_helper;
+namespace native {
+
+using dipu::diopi_helper::toDiopiGeneratorHandle;
+using dipu::diopi_helper::toDiopiSize;
+using dipu::diopi_helper::toDiopiRoundMode;
-$functions_code
+$functions_code
+} // namespace native
+} // namespace dipu
-} // namespace dipu::native
+// NOLINTEND(readability-redundant-control-flow)
namespace at {
DIPU_LIBRARY_IMPL(aten, DIPU_DEVICE_TYPE_MACRO, m) {
- $op_register_code
+ $op_register_code
}
DIPU_LIBRARY_IMPL(aten, DIPU_AUTOGRAD_DEVICE_TYPE_MACRO, m) {
- $autograd_op_register_code
+ $autograd_op_register_code
}
} // namespace at
@@ -49,34 +57,34 @@
"""
// $comment
$cppsignautre {
- dipu::profile::RecordBlockCreator _(__FUNCTION__);
- $custom_code_at_the_beginning
+ dipu::profile::RecordBlockCreator _(__FUNCTION__);
+ $custom_code_at_the_beginning
- ::diopiContext context(dipu::getCurrentDIPUStream().rawstream());
- auto ctx = &context;
+ ::diopiContext context(dipu::getCurrentDIPUStream().rawstream());
+ auto ctx = &context;
- $input_process_code
+ $input_process_code
- $output_process_code
+ $output_process_code
- $attrs_process_code
+ $attrs_process_code
- $device_check_code
+ $device_check_code
- $custom_code_before_call_diopi
+ $custom_code_before_call_diopi
- dipu::profile::RecordBlockCreator dipuRecorder(R"($diopi_fun_call_code)");
- ::diopiError_t ret = $diopi_fun_call_code
- dipuRecorder.end();
- if (checkDiopiReturnValue()) {
- TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, R"($diopi_fun_call_code)", " error, error code is ", ret, "error message is ", diopiGetLastErrorString());
- }
+ dipu::profile::RecordBlockCreator dipuRecorder(R"($diopi_fun_call_code)");
+ ::diopiError_t ret = $diopi_fun_call_code
+ dipuRecorder.end();
+ if (checkDiopiReturnValue()) {
+ TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, R"($diopi_fun_call_code)", " error, error code is ", ret, "error message is ", diopiGetLastErrorString());
+ }
- $custom_code_before_return
+ $custom_code_before_return
- synchronizeIfEnable();
+ synchronizeIfEnable();
- $return_code
+ $return_code
}
"""
@@ -94,29 +102,29 @@
"""
class $autograd_function_name : public torch::autograd::Function<$autograd_function_name> {
public:
- static $return_code forward(torch::autograd::AutogradContext *ctx, $param_list) {
- $forward_process_code
+ static $return_code forward(torch::autograd::AutogradContext *ctx, $param_list) {
+ $forward_process_code
- $save_for_backward_code
+ $save_for_backward_code
- at::AutoDispatchBelowADInplaceOrView g;
- return $call_forward_impl_code;
- }
+ at::AutoDispatchBelowADInplaceOrView g;
+ return $call_forward_impl_code;
+ }
static std::vector<at::Tensor> backward(torch::autograd::AutogradContext *ctx, std::vector<at::Tensor> grad_outputs) {
- $load_saved_data_code
+ $load_saved_data_code
- $cal_grad_code
+ $cal_grad_code
- $call_backward_impl_code
+ $call_backward_impl_code
- $backward_return_code
+ $backward_return_code
}
};
$cppsignautre {
- auto result = $autograd_function_name::apply($arg_name_list);
- $wrappter_custom_return
+ auto result = $autograd_function_name::apply($arg_name_list);
+ $wrappter_custom_return
}
"""
@@ -125,15 +133,15 @@ class $autograd_function_name : public torch::autograd::Function<$autograd_funct
"""
// $comment
$cppsignautre {
- std::cout << std::endl << __FUNCTION__ << std::endl;
- $transform_input_to_cpu_code
+ std::cout << std::endl << __FUNCTION__ << std::endl;
+ $transform_input_to_cpu_code
- $execute_op_on_cpu_code
+ $execute_op_on_cpu_code
- $execute_op_on_device_code
+ $execute_op_on_device_code
- $transform_result_to_cpu_code
+ $transform_result_to_cpu_code
- $result_compare_code
+ $result_compare_code
}
"""
diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp
index 47f519984e..c7298900ae 100644
--- a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp
+++ b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp
@@ -15,12 +15,12 @@ namespace dipu {
namespace native {
// NOTICE: these 2 func defined in AutoGenedKernels.cpp
// if dipu autogen support header file gen, remove this
-at::Tensor dipu_wrap_diopi_cast_dtype(const at::Tensor& src,
+at::Tensor dipu_wrap_diopi_cast_dtype(const at::Tensor& self,
at::ScalarType dtype);
// if dipu autogen support proxy one torch op to multiple diopi op, remove
// this.
-at::Tensor& dipu_wrap_diopi_copy_inp(at::Tensor& dst, const at::Tensor& src,
+at::Tensor& dipu_wrap_diopi_copy_inp(at::Tensor& self, const at::Tensor& src,
bool non_blocking);
} // namespace native
From 16028db2b438b819ac5618bbe90b012a9c81197c Mon Sep 17 00:00:00 2001
From: Aaron
Date: Mon, 11 Dec 2023 11:35:18 +0800
Subject: [PATCH 06/58] [FIX] fix virtual memory error of using SUPA (#468)
* [FIX] fix virtual memory of SUPA
* [FIX] fix incorrect copy
* [FIX] remove useless copy and add missing 'supa' in cmakelists.txt
---
dipu/CMakeLists.txt | 6 ++--
.../csrc_dipu/vendor/supa/copyinplace.cpp | 17 +++++++----
.../csrc_dipu/vendor/supa/deviceimpl.cpp | 29 ++++++++++++++-----
3 files changed, 37 insertions(+), 15 deletions(-)
diff --git a/dipu/CMakeLists.txt b/dipu/CMakeLists.txt
index d94770c289..24b368a9de 100644
--- a/dipu/CMakeLists.txt
+++ b/dipu/CMakeLists.txt
@@ -44,7 +44,7 @@ elseif (${DEVICE} IN_LIST DEVICE_TOPSRIDER)
elseif (${DEVICE} IN_LIST DEVICE_SUPA)
set(USE_SUPA ON)
set(UsedVendor supa)
- set(DIOPI_IMPL_OPT "")
+ set(DIOPI_IMPL_OPT "supa")
#SUPA DEVICE DOES NOT NEED TO BUILD DIOPI, so set the target to "" to control the workflow.
elseif (${DEVICE} IN_LIST DEVICE_DROPLET)
set(USE_DROPLET ON)
@@ -81,14 +81,14 @@ if(NOT DEFINED DIPU_ABI_V)
OUTPUT_VARIABLE DIPU_ABI_V)
endif()
-if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
+if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
execute_process(
COMMAND
sh -x -c
"python -c 'import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)'"
OUTPUT_VARIABLE DIPU_COMPILED_WITH_CXX11_ABI)
endif()
-
+
if(DIPU_COMPILED_WITH_CXX11_ABI GREATER 0)
set(DIPU_COMPILED_WITH_CXX11_ABI 1)
else()
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
index 0b84a9e8ab..9149e8e985 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
@@ -18,11 +18,18 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI {
SUPACopyInplace() = default;
~SUPACopyInplace() = default;
- // assume it can handle between device.
- void copyNodirectBetweenDevices(at::Tensor& dst, const at::Tensor& src,
- bool non_blocking,
- CopyParamsInfo& info) override {
- dipu_wrap_diopi_copy_inp(dst, src, non_blocking);
+ void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override {
+ auto curStream = dipu::getCurrentDIPUStream();
+ ::diopiContext context(curStream.rawstream());
+ auto ctx = &context;
+ auto diopi_src = dipu::diopi_helper::toDiopiTensorHandle(src);
+ auto diopi_dst = dipu::diopi_helper::toDiopiTensorHandle(dst);
+ TORCH_CHECK(diopiError_t::diopiSuccess ==
+ diopiCopyInp(ctx, diopi_src, diopi_dst));
+ // syncAfterCopy
+ if (!non_blocking) {
+ dipu::devapis::syncStream(curStream.rawstream());
+ }
}
};
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
index c04b74e79f..f2f2983869 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
@@ -184,6 +184,8 @@ DIPU_API void freeHost(void* p) { free(p); }
extern "C" {
void* br_device_malloc(uint64_t bytes);
void br_device_free(void* ptr);
+// get the physical address for a (virtual) device pointer
+void* get_phy_ptr(const void* ptr);
}
DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) {
@@ -206,47 +208,60 @@ DIPU_API bool isPinnedPtr(const void* p) { return false; }
// (asynchronous) set val
DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val,
size_t size) {
- SUPA_CALL(suMemsetAsync(ptr, val, size, stream));
+ auto phy_gpu_addr = get_phy_ptr(ptr);
+ SUPA_CALL(suMemsetAsync(phy_gpu_addr, val, size, stream));
}
// (synchronous) copy from device to a device
DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst,
deviceId_t srcDevId, const void* src) {
// SUPA uses Unified Virtual Address
- SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToDevice));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpy(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes,
+ suMemcpyDeviceToDevice));
}
// (synchronous) copy from host to a device
DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst,
/*Host srcDev,*/ const void* src) {
- SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyHostToDevice));
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpy(phy_dst_gpu_addr, src, nbytes, suMemcpyHostToDevice));
}
// (synchronous) copy from a device to host
DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst,
/*deviceId_t srcDevId,*/ const void* src) {
- SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToHost));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ SUPA_CALL(suMemcpy(dst, phy_src_gpu_addr, nbytes, suMemcpyDeviceToHost));
}
// (asynchronous) copy from device to a device
DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes,
deviceId_t dstDevId, void* dst,
deviceId_t srcDevId, const void* src) {
- SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToDevice));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, stream,
+ suMemcpyDeviceToDevice));
}
// (asynchronous) copy from host to a device
DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes,
/*deviceId_t dstDevId,*/ void* dst,
/*Host srcDev,*/ const void* src) {
- SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyHostToDevice));
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, src, nbytes, stream,
+ suMemcpyHostToDevice));
}
// (asynchronous) copy from a device to host
DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes,
/*Host dstDev,*/ void* dst,
/*deviceId_t srcDevId,*/ const void* src) {
- SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToHost));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ SUPA_CALL(suMemcpyAsync(dst, phy_src_gpu_addr, nbytes, stream,
+ suMemcpyDeviceToHost));
}
} // end namespace devapis
} // end namespace dipu
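The device-side changes in this patch all follow the same pattern: SUPA's allocator returns virtual pointers, so each memset/memcpy first translates them through get_phy_ptr before calling the runtime. A minimal sketch of that pattern, assuming the vendor's suMemcpy API and SUPA_CALL macro from the headers used above (copy_d2d is an illustrative wrapper, not part of the patch):

    // Sketch: pointers from the SUPA allocator are virtual, so they are
    // translated with get_phy_ptr before being handed to the runtime.
    // suMemcpy, suMemcpyDeviceToDevice and SUPA_CALL are assumed to come
    // from the vendor headers already included in this file.
    #include <cstddef>

    extern "C" void* get_phy_ptr(const void* ptr);  // virtual -> physical

    void copy_d2d(void* dst, const void* src, size_t nbytes) {
      void* phy_dst = get_phy_ptr(dst);
      void* phy_src = get_phy_ptr(src);
      SUPA_CALL(suMemcpy(phy_dst, phy_src, nbytes, suMemcpyDeviceToDevice));
    }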
From b178d4c5f5bdb49ae224c237f836d211670e2836 Mon Sep 17 00:00:00 2001
From: wyz5864 <109072365+wyz5864@users.noreply.github.com>
Date: Mon, 11 Dec 2023 11:36:25 +0800
Subject: [PATCH 07/58] make conv2d out at right memory-format (#502)
---
.../autogen_diopi_wrapper/diopi_functions.yaml | 6 +++---
dipu/tests/python/unittests/test_conv2d.py | 17 +++++++++++++++++
2 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
index 46bebbd3f2..af1e62e564 100755
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -515,7 +515,7 @@
int64_t out_height = (height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1;
int64_t out_width = (width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1;
c10::SmallVector output_size = {batch_size, out_channel, out_height, out_width};
- at::Tensor out = at::empty(output_size, input.options());
+ at::Tensor out = at::empty(output_size, input.options().memory_format(input.suggest_memory_format()));
interface: diopiConvolution2d(&context, out, input, weight, bias, stride, padding, dilation, groups)
- schema: "convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
@@ -527,10 +527,10 @@
at::Tensor grad_bias;
std::vector<int64_t> bias_sizes;
if (output_mask[0]) {
- grad_input = at::empty(input.sizes(), input.options());
+ grad_input = at::empty_like(input);
}
if (output_mask[1]) {
- grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat));
+ grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat).memory_format(weight.suggest_memory_format()));
}
if (output_mask[2]) {
bias_sizes.push_back(grad_output.size(1));
diff --git a/dipu/tests/python/unittests/test_conv2d.py b/dipu/tests/python/unittests/test_conv2d.py
index e93181c670..b33677aef3 100644
--- a/dipu/tests/python/unittests/test_conv2d.py
+++ b/dipu/tests/python/unittests/test_conv2d.py
@@ -39,6 +39,23 @@ def test_conv_2d(self):
)
# print("conv2d output compare successfully")
+ def test_conv2d_nhwc(self):
+ device = torch.device("dipu")
+
+ m = nn.Conv2d(2, 3, 3).to(device=device, memory_format=torch.channels_last)
+ self.assertTrue(m.weight.is_contiguous(memory_format=torch.channels_last))
+
+ x = torch.rand(2, 2, 5, 5).to(device=device, memory_format=torch.channels_last)
+ x.requires_grad_()
+ self.assertTrue(x.is_contiguous(memory_format=torch.channels_last))
+
+ y = m(x)
+ self.assertTrue(y.is_contiguous(memory_format=torch.channels_last))
+
+ y.backward(torch.rand_like(y))
+ self.assertTrue(x.grad.is_contiguous(memory_format=torch.channels_last))
+ self.assertTrue(m.weight.grad.is_contiguous(memory_format=torch.channels_last))
+
if __name__ == "__main__":
run_tests()
From ad46e399c64976f525211a4ea2fad0d25ffa9ca3 Mon Sep 17 00:00:00 2001
From: tangzhiyi11
Date: Mon, 11 Dec 2023 11:49:30 +0800
Subject: [PATCH 08/58] [dicp][ascend] add fusion switch file for ascend (#512)
---
dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg | 10 ++++++++++
dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp | 8 +++++---
dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h | 8 +++++++-
dicp/dicp/vendor/AscendGraph/compile_job.py | 3 ++-
4 files changed, 24 insertions(+), 5 deletions(-)
create mode 100644 dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg b/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg
new file mode 100644
index 0000000000..71834659c8
--- /dev/null
+++ b/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg
@@ -0,0 +1,10 @@
+{
+ "Switch":{
+ "GraphFusion":{
+ "ALL":"on"
+ },
+ "UBFusion":{
+ "ALL":"on"
+ }
+ }
+}
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp b/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp
index fbced63f60..99f422dcaa 100644
--- a/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp
+++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp
@@ -1,7 +1,8 @@
#include "graph_utils.h"
static void compile(const std::string& graph_path,
- const std::string& graph_json_file) {
+ const std::string& graph_json_file,
+ const std::string& fusion_switch_file) {
std::string graph_name = "BuildGraph";
Graph graph(graph_name.c_str());
std::ifstream f(graph_json_file);
@@ -18,13 +19,14 @@ static void compile(const std::string& graph_path,
}
}
- AclgraphBuilder builder;
+ AclgraphBuilder builder{fusion_switch_file};
builder.saveGraph(graph_path, graph, options);
}
int main(int argc, char* argv[]) {
std::string graph_path{argv[1]};
std::string graph_json_file{argv[2]};
- compile(graph_path, graph_json_file);
+ std::string fusion_switch_file{argv[3]};
+ compile(graph_path, graph_json_file, fusion_switch_file);
return 0;
}
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
index 69e06fec8a..2cbacf3bcb 100644
--- a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
+++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
@@ -82,11 +82,14 @@ ge::Operator genInput(const std::string op_name,
class AclgraphBuilder {
public:
- explicit AclgraphBuilder() {
+ explicit AclgraphBuilder(const std::string& fusion_switch_file)
+ : _fusion_switch_file(fusion_switch_file) {
// 1. system init
auto kSocVersion = aclrtGetSocName();
std::map<AscendString, AscendString> global_options = {
{AscendString(ge::ir_option::SOC_VERSION), AscendString(kSocVersion)},
+ {AscendString(ge::ir_option::FUSION_SWITCH_FILE),
+ AscendString(_fusion_switch_file.c_str())},
{AscendString(ge::ir_option::PRECISION_MODE), "allow_fp32_to_fp16"},
};
auto status = aclgrphBuildInitialize(global_options);
@@ -122,6 +125,9 @@ class AclgraphBuilder {
aclgrphBuildFinalize();
std::cout << "aclgrphBuildFinalize success!" << std::endl;
}
+
+ private:
+ std::string _fusion_switch_file;
};
ge::Format get_ascend_format(const std::string& format) {
diff --git a/dicp/dicp/vendor/AscendGraph/compile_job.py b/dicp/dicp/vendor/AscendGraph/compile_job.py
index 625dc3dfb3..93b70dca43 100644
--- a/dicp/dicp/vendor/AscendGraph/compile_job.py
+++ b/dicp/dicp/vendor/AscendGraph/compile_job.py
@@ -28,6 +28,7 @@ def __init__(self, source_code) -> None:
graph_util_path = load_and_run.__file__.replace('/load_and_run.py', '')
source_path = graph_util_path + '/graph_compile.cpp'
json_util_path = graph_util_path + '/nlohmann'
+ self.fusion_switch_file = graph_util_path + '/fusion_switch.cfg'
self._cmd = ['/usr/bin/c++',
'-D_GLIBCXX_USE_CXX11_ABI=0',
'-fPIC',
@@ -67,7 +68,7 @@ def get_key(self):
def build_graph(self, output_path, graph_path):
self._compile()
- cmd = [self._lib_path, output_path, graph_path]
+ cmd = [self._lib_path, output_path, graph_path, self.fusion_switch_file]
try:
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
From 0bbb2ee32caa4aaa06a7193aab22bbef8e3c904b Mon Sep 17 00:00:00 2001
From: Lingjie
Date: Wed, 13 Dec 2023 14:45:53 +0800
Subject: [PATCH 09/58] [dipu] Speedup profiler ctor when not enabled (#526)
* speedup profiler ctor
* clean & format include
---
.../csrc_dipu/profiler/profiler.cpp | 71 ++++++++-----------
dipu/torch_dipu/csrc_dipu/profiler/profiler.h | 71 ++++++++++++-------
2 files changed, 75 insertions(+), 67 deletions(-)
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp
index ea23bf43f0..4789b49848 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp
+++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp
@@ -1,12 +1,17 @@
#include "profiler.h"
-#include
#include
-#include
+#include
+#include
#include
+#include
#include
+#include "csrc_dipu/profiler/CorrelationIDManager.h"
+
+#include "ThreadUtil.h"
+
namespace dipu {
namespace profile {
@@ -265,22 +270,20 @@ void abandonAllRecords() {
resetId();
}
-RecordCreator::RecordCreator(const string_t& name, size_t opId,
+RecordCreator::RecordCreator(string_t name, size_t opId,
uint64_t linkCorrelationId,
- const ExtraRecordInfo& extraInfo) {
+ ExtraRecordInfo extraInfo) {
if (isEnable()) {
- name_ = name;
+ name_ = std::move(name);
opId_ = opId;
begin_ = torch::profiler::impl::getTime();
end_ = false;
linkCorrelationId_ = linkCorrelationId;
- extraInfo_ = extraInfo;
+ extraInfo_ = std::move(extraInfo);
}
}
-RecordCreator::~RecordCreator() { end(); }
-
-void RecordCreator::end() {
+void RecordCreator::end() noexcept {
if (!end_) {
RecordsImpl::get().addRecord(
Record{name_, opId_, begin_,
@@ -295,12 +298,12 @@ void RecordCreator::end() {
DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream,
int streamId, size_t opId,
uint64_t linkCorrelationId,
- const ExtraRecordInfo& extraInfo) {
+ ExtraRecordInfo extraInfo) {
if (isEnable()) {
DeviceRecordsImpl::get().ensureSetup(stream);
- name_ = name;
+ name_ = std::move(name);
opId_ = opId;
- extraInfo_ = extraInfo;
+ extraInfo_ = std::move(extraInfo);
stream_ = stream;
streamId_ = streamId;
pStart_.reset(new DeviceEvent());
@@ -311,9 +314,7 @@ DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream,
}
}
-DeviceRecordCreator::~DeviceRecordCreator() { end(); }
-
-void DeviceRecordCreator::end() {
+void DeviceRecordCreator::end() noexcept {
if (!end_) {
TORCH_CHECK(pStart_, "dipu profiler error with pStart_ is not inited");
TORCH_CHECK(pStop_, "dipu profiler error with pStop_ is not inited");
@@ -329,12 +330,12 @@ void DeviceRecordCreator::end() {
}
static std::string extraceFunction(const std::string& functionName) {
- auto start = functionName.find_first_not_of(":");
+ auto start = functionName.find_first_not_of(':');
if (start == std::string::npos) {
return "";
}
- auto end = functionName.find_first_of("(");
+ auto end = functionName.find_first_of('(');
if (end == std::string::npos) {
end = functionName.size();
}
@@ -345,32 +346,18 @@ static std::string extraceFunction(const std::string& functionName) {
return functionName.substr(start, end - start);
}
-RecordBlockCreator::RecordBlockCreator(string_t name,
- const ExtraRecordInfo& extraInfo,
- deviceStream_t stream, int streamId,
- bool enProfile) {
- if (enProfile && isEnable()) {
- size_t opId = generateId();
- uint64_t correlationId =
- CorrelationIDManager::instance().getCorrelationID();
- name = extraceFunction(name);
- pHostRecord_.reset(new RecordCreator("LaunchKernel_" + name, opId,
- correlationId, extraInfo));
- pDeviceRecord_.reset(new DeviceRecordCreator(name, stream, streamId, opId,
- correlationId, extraInfo));
- }
-}
-
-void RecordBlockCreator::end() {
- if (!finish_) {
- pHostRecord_.reset();
- pDeviceRecord_.reset();
- }
- finish_ = true;
+void RecordBlockCreator::initialize(string_t name, ExtraRecordInfo extraInfo,
+ deviceStream_t stream,
+ c10::StreamId streamId) {
+ size_t opId = generateId();
+ uint64_t correlationId = CorrelationIDManager::instance().getCorrelationID();
+ name = extraceFunction(name);
+ pHostRecord_ = std::make_unique<RecordCreator>("LaunchKernel_" + name, opId,
+ correlationId, extraInfo);
+ pDeviceRecord_ = std::make_unique<DeviceRecordCreator>(
+ std::move(name), stream, streamId, opId, correlationId,
+ std::move(extraInfo));
}
-
-RecordBlockCreator::~RecordBlockCreator() { end(); }
-
} // namespace profile
} // namespace dipu
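The speedup in this commit comes from making the record creators effectively no-ops when profiling is disabled: the constructor only checks the enable flag, and all member setup moves into an initialize() path that runs solely when profiling is on. A minimal sketch of that lazy-initialization shape, assuming a cheap global enable check (the class and function names are illustrative, not the actual DIPU ones):

    // Sketch: pay for record setup only when the profiler is enabled, so the
    // per-op constructor cost is a single branch in the common (disabled) case.
    #include <memory>
    #include <string>

    bool profilingEnabled();  // assumed cheap global switch

    class ScopedRecord {
     public:
      explicit ScopedRecord(std::string name) {
        if (profilingEnabled()) {        // only do the expensive setup when enabled
          initialize(std::move(name));
        }
      }
      ~ScopedRecord() { record_.reset(); }  // nothing to flush if profiling was off

     private:
      struct Record { std::string name; };
      void initialize(std::string name) {
        record_ = std::make_unique<Record>();
        record_->name = std::move(name);
      }
      std::unique_ptr<Record> record_;
    };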
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h
index eed733567c..7cb5a750d5 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h
+++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h
@@ -1,23 +1,23 @@
#pragma once
-#include
-#include
-#include
+#include
#include
#include