diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b813c042ae..0f9015bb70 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1,30 @@ # IMPORTANT: # This file is ONLY used to merge PRs. Approvals from people in this file are required for merging. +# +# WARNING: The last matching pattern takes the most precedence and OVERWRITES previous rules. +# Please be very careful when adding new patterns. -/dipu/tests/python @lljbash @mrdanielw +# ---------- base ---------- + +* @mrdanielw @jinminxi104 +/.github/ @mrdanielw @wugeshui +/.github/CODEOWNERS @mrdanielw @jinminxi104 + +# ---------- dipu ---------- + +### directories & files +/dipu/torch_dipu/csrc_dipu/ @mrdanielw @fandaoyi @lljbash +/dipu/tests/python/ @mrdanielw @lljbash +/dipu/scripts/autogen_diopi_wrapper/ @mrdanielw @lljbash +/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py @mrdanielw @zhaoguochun1995 +/dipu/scripts/ci/ @mrdanielw @wugeshui + +### build & linter +/dipu/**/CMakeLists.txt @mrdanielw @lljbash @wiryls @Wrench-Git +/dipu/**/*.cmake @mrdanielw @lljbash @wiryls @Wrench-Git +/dipu/.clang* @mrdanielw @lljbash @wiryls + +# ---------- dicp ---------- + +/dicp/ @jinminxi104 +/dicp/scripts/ci/ @jinminxi104 @wugeshui diff --git a/.github/actions/code-build-test/action.yml b/.github/actions/code-build-test/action.yml index a3f2c32a62..6bbede0ee6 100644 --- a/.github/actions/code-build-test/action.yml +++ b/.github/actions/code-build-test/action.yml @@ -81,6 +81,10 @@ runs: else export CI=true source ~/.bashrc - cd ${WORK_PATH} && rm -rf ${JOB_NAME} && cp -R source ${JOB_NAME} && cd ${JOB_NAME} + cd ${WORK_PATH} + if [ "${{ inputs.cover_job }}" == "0" ];then + rm -rf ${JOB_NAME} && cp -R source ${JOB_NAME} + fi + cd ${JOB_NAME} ${{ inputs.build_shell }} ${cleaner_shell} fi diff --git a/.github/workflows/_runs-on-ascend.yml b/.github/workflows/_runs-on-ascend.yml index 21a93a328a..928f7f1e66 100644 --- a/.github/workflows/_runs-on-ascend.yml +++ b/.github/workflows/_runs-on-ascend.yml @@ -12,7 +12,7 @@ on: description: Set up the build environment type: string required: false - default: "tps-ascend-ci" + default: "dicp-ascend-ci-910b" jobs: checkout_code: @@ -22,25 +22,38 @@ jobs: - name: Checkout Code uses: DeepLink-org/deeplink.framework/.github/actions/checkout-code@main - build: + build_test: runs-on: ${{ inputs.runner }} needs: checkout_code steps: - - name: build on ascend + - name: build and test on ascend uses: DeepLink-org/deeplink.framework/.github/actions/code-build-test@main with: - build_shell: "pwd" #Write the script you want to execute here,If you don't know which parameters to fill in, you can refer to the actions/code-build-test - job_name: "build" + build_shell: " + source dicp/scripts/ci/ascend/dipu_env.sh && \ + rm -rf /tmp/torchinductor_autolink/* && \ + rm -rf /tmp/dicp_ascend/* && \ + cd /mnt/cache/share/deeplinkci/dicp_env/transformers && \ + pip uninstall transformers -y && \ + patch -p1 < modeling_llama.diff && patch -p1 < utils.diff && \ + python setup.py clean && \ + python setup.py install --user && \ + patch -R -p1 < modeling_llama.diff && patch -R -p1 < utils.diff && \ + cd - && \ + cd /mnt/cache/share/deeplinkci/dicp_env/accelerate && \ + pip uninstall accelerate -y && \ + python setup.py clean && \ + python setup.py install --user && \ + cd - && \ + pip uninstall torch_dipu -y && \ + pip uninstall dicp -y && \ + cd dipu && python setup.py clean && python setup.py install --user && \ + cd ../dicp && python setup.py clean && python setup.py install --user && \ + source 
scripts/ci/ascend/test_env.sh /mnt/cache/share/deeplinkci/dicp_env/llama_models && \ + export TEST_DIR=$(pwd)/test && echo ${TEST_DIR} && \ + bash ${TEST_DIR}/ascend_scripts/ops/run_test_ops.sh false && \ + bash ${TEST_DIR}/ascend_scripts/models/run_test_models.sh false + " #Write the script you want to execute here,If you don't know which parameters to fill in, you can refer to the actions/code-build-test + job_name: "build_test" cover_job: "0" cleaner: "clean_all_if_error" - - test: - runs-on: ${{ inputs.runner }} - needs: build - steps: - - name: rt test on ascend - uses: DeepLink-org/deeplink.framework/.github/actions/code-build-test@main - with: - build_shell: "pwd" #Write the script you want to execute here,If you don't know which parameters to fill in, you can refer to the actions/code-build-test - job_name: "build" - cover_job: "1" diff --git a/.github/workflows/_runs-on-topsrider.yml b/.github/workflows/_runs-on-topsrider.yml index 8427855c6d..21efb5d972 100644 --- a/.github/workflows/_runs-on-topsrider.yml +++ b/.github/workflows/_runs-on-topsrider.yml @@ -26,7 +26,7 @@ jobs: runs-on: ${{ inputs.runner }} needs: checkout_code steps: - - name: build and test on topsrider + - name: build on topsrider uses: DeepLink-org/deeplink.framework/.github/actions/code-build-test@main with: build_shell: " @@ -34,11 +34,19 @@ jobs: pip uninstall torch_dipu -y && \ pip uninstall dicp -y && \ cd dipu && python setup.py install --user && \ - cd ../dicp && python setup.py install --user && \ - cd .. && source dicp/scripts/ci/tops/ci_tops_test_env.sh /mnt/models/llama_models && \ - export TEST_DIR=$(pwd)/dicp/test && echo ${TEST_DIR} && \ - bash ${TEST_DIR}/tops_scripts/ops/run_test_ops.sh false && \ - bash ${TEST_DIR}/tops_scripts/models/run_test_models.sh false + cd ../dicp && python setup.py install --user " job_name: "build_test" cover_job: "0" + + - name: test ops on topsrider + uses: DeepLink-org/deeplink.framework/.github/actions/code-build-test@main + with: + build_shell: " + source dicp/scripts/ci/tops/ci_tops_test_env.sh \ + /mnt/models/llama_models /mnt/models/stable_diffusion_models && \ + export TEST_DIR=$(pwd)/dicp/test && \ + bash ${TEST_DIR}/tops_scripts/ops/run_test_ops.sh false + " + job_name: "build_test" + cover_job: "1" diff --git a/.github/workflows/dicp.yml b/.github/workflows/dicp.yml index 537fd239ee..d758fd1c75 100644 --- a/.github/workflows/dicp.yml +++ b/.github/workflows/dicp.yml @@ -1,15 +1,14 @@ name: dicp ci on: workflow_dispatch: - push: - branches: - - main + schedule: + - cron: '10 23 * * *' pull_request: - paths-ignore: - - "**.md" - - ".github/ISSUE_TEMPLATE/**" - - ".git*" - - "CODE_OF_CONDUCT**" + paths: + - ".github/workflows/dicp.yml" + - ".github/workflows/_runs-on-ascend.yml" + - ".github/workflows/_runs-on-topsrider.yml" + - "dicp/**" env: ENV_PATH: '/mnt/cache/share/platform/env' diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index cbad72ae4a..32efe64a4e 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -11,19 +11,22 @@ jobs: markdownlint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 with: - fetch-depth: 2 - - uses: tj-actions/changed-files@v40 + fetch-depth: 8 + - name: Collect changed files + uses: tj-actions/changed-files@v40 id: changed-files with: files: '**/*.md' - separator: "," - - uses: DavidAnson/markdownlint-cli2-action@v14 + separator: ',' + - name: MarkdownLint if: steps.changed-files.outputs.any_changed == 'true' + 
uses: DavidAnson/markdownlint-cli2-action@v14 with: globs: ${{ steps.changed-files.outputs.all_changed_files }} - separator: "," + separator: ',' clang-format: needs: markdownlint diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6f946ae407..852a4335f9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,7 +15,7 @@ env: CAMB_CLUSTER: CAMB CAMB_TORCH_BASE_DIR: '/mnt/lustre/share/parrotsci/github/cibuild/pytorchbase' CUDA_CI_PATH: '/mnt/cache/share/parrotsci/github/cibuild/${{ github.repository }}' - CUDA_PARTATION: ${{ vars.SH1988_SLURM_PAR != '' && vars.SH1988_SLURM_PAR || 'pat_rd -x SH-IDC1-10-198-8-60' }} + CUDA_PARTATION: ${{ vars.SH1988_SLURM_PAR != '' && vars.SH1988_SLURM_PAR || 'pat_rd' }} CUDA_CLUSTER: SH1988 DEEPLINK_PATH: '/mnt/cache/share/deeplinkci/github/${{ github.repository }}' ASCEND_CLUSTER: ASCEND @@ -24,7 +24,7 @@ env: CI_BUILD_FLAG: "ci_build_flag" PYTORCH_COMMIT: ${{ vars.PYTORCH_COMMIT != '' && vars.PYTORCH_COMMIT || 'c263bd43e8e8502d4726643bc6fd046f0130ac0e' }} # pytorch tag 2.0 ALL_COVERAGE: ${{ (contains( github.ref, 'main') || startsWith(github.ref, 'refs/heads/v') || startsWith(github.ref, 'refs/heads/dev')) && 'ON' || 'OFF' }} - REQUIRE_COVERAGE: ${{ vars.REQUIRE_COVERAGE != '' && vars.REQUIRE_COVERAGE || '40' }} + REQUIRE_COVERAGE: ${{ vars.REQUIRE_COVERAGE != '' && vars.REQUIRE_COVERAGE || '0' }} REPO: ${{ github.event.repository.name }} concurrency: @@ -128,7 +128,7 @@ jobs: cd ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Camb rm -rf scripts ln -s ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main/dipu/third_party/DIOPI/scripts scripts - source /mnt/cache/share/platform/env/camb_ci_diopi_impl + source /mnt/cache/share/platform/env/pt2.0_diopi bash scripts/increment_coverage.sh ${REQUIRE_COVERAGE} """ diff --git a/README.md b/README.md index a3a17decde..7dd7aafe2f 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Deeplink.framework 是 DeepLink 推出的介于 AI 训练框架和硬件语言 ### DIPU -DIPU (Device Independent Process Unit) 是由一组抽象设备 runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层共同组成的拓展包。 用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。 +DIPU (Device Independent Process Unit) 是由一组抽象设备 runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层共同组成的拓展包。用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。 ### DICP diff --git a/dicp/MANIFEST.in b/dicp/MANIFEST.in new file mode 100644 index 0000000000..9ce7b59fbf --- /dev/null +++ b/dicp/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include dicp/vendor/TopsGraph/codegen * +recursive-include dicp/vendor/AscendGraph/codegen * \ No newline at end of file diff --git a/dicp/README.md b/dicp/README.md new file mode 100644 index 0000000000..db01a09b6c --- /dev/null +++ b/dicp/README.md @@ -0,0 +1,85 @@ + +
+<!-- DeepLink Logo (image) -->
+ +# DICP + +标准编译协议(Device-Independent Compile Protocol, DICP)定义了统一的计算描述(中间表示),通过计算图获取深度学习模型中的计算任务表达为上述中间表示,然后通过计算图优化技术自动生成人工智能芯片设备代码,从而提高研发效率和计算的执行性能。中间表示是介于源语言和目标语言之间的程序表示,能够极大程度地提高编译流程的可拓展性,同时也能降低优化流程对前端和后端的破坏。多层次中间表示包含从应用到芯片端的多种表示层次,不同层次旨在解决不同尺度的问题。 + +DICP 主要的核心功能如下: + +1. 通过接入编译路线带来性能优势,在大模型场景最大限度释放芯片能力。 +2. 作为训练框架与国产硬件芯片之间的通用桥梁,支持多种前后端,带来使用易用性。 +3. 提供易用、高效的一站式编译适配流程,灵活支持国产硬件图编译器的特性,提高芯片适配效率。 + +下图描述了 DICP 在编译链路中的位置: + +![DICP 在编译链路中的位置](https://deeplink.readthedocs.io/zh-cn/latest/_static/image/DICP/dicp_flow.png) + +1. 训练框架通过图获取模块将用户的模型代码转换成统一的中间表达。此处的中间表达完全与芯片无关。所以在之后的编译协议部分中,需要建立起与后端芯片的联系。这样才能高效的完成接入。 +2. 编译协议完成了衔接框架与芯片编译器的工作,其中包含硬件相关的切图,统一中间表达与芯片所支持的算子之间的映射关系以及数据格式的转换模块。 +3. 在编译协议吸收了芯片特点之后,由代码生成模块生成最终的代码,并通过芯片的编译器生成二进制可执行文件之后由框架调用。 + +## 基于 DICP 的国产硬件接入 PyTorch 2 实践 + + + +基于上述 DICP,国产硬件可快速接入 PyTorch 2 的编译路线。此路线中的 TorchDynamo 组件,可使国产硬件在运行时的 overhead 大幅缩小。 +并且针对国产硬件实现了以下特性: + +- 灵活支持国产硬件图编译器的特性 +- 支持多种国产硬件数据格式 +- 支持动态 shape + +### 运行逻辑 + +DICP 的运行逻辑如下图所示: + + +![DICP 的运行逻辑](https://deeplink.readthedocs.io/zh-cn/latest/_static/image/DICP/structure.png) + +其中: + +1. **算子映射**:主要解决框架层算子与后端图编译器的算子之间的语义差别,包括 1 对 1 和 1 对多的转换。 +2. **Shape & Dtype 推导**:进行 Shape & data_type 的推导,补全整张静态图上的信息,便于之后在代码生成模块能生成代码。 +3. **子图改写**:将多个小算子融合成为一个或多个适合图编译器的算子,配合后端图编译器将计算效率最大化。 +4. **数据格式调整**:是根据后端芯片与其图编译器的特性,针对特定的算子调整其输入输出的数据格式,使得最大程度的发挥芯片性能。 + +### 目录结构 + +- `dicp/dynamo_bridge`:多后端通用的接入代码,包含了 + 1. 接收从 AOTAutograd 下发而来的 FX Graph + 2. 启动各个厂商的 IR 转换与优化 + 3. 启动 CodeGen 以及 JIT 缓存的逻辑。 +- `dicp/vender`: 主要包含了各个厂商 IR 的定义,AtenIR 到厂商 IR 的转换,厂商 IR 上的优化以及最后的代码生成模块。 +- `test`: 包含了 model 测试与 op 测试 + +### Demo + +#### 安装 DICP + +```bash +cd /path_to_dicp +pip install . +``` + +#### 在华为 910 上执行 llama7B 前向推理 + +```bash +export DIPU_MOCK_CUDA = false +export DICP_TOPS_DIPU = True +export TEST_DIR = /path_to_dicp/test/ +export LLAMA_MODEL_DIR=/path_to_llama_model +bash /path_to_dicp/test/model/run_test_model.sh llama ascendgraph false +``` + +#### 在燧原 T20 上执行 resnet50 训练 + +```bash +export DIPU_MOCK_CUDA = false +export DICP_TOPS_DIPU = True +export TEST_DIR = /path_to_dicp/test/ +bash /path_to_dicp/test/model/run_test_model.sh resnet50 topsgraph false +``` diff --git a/dicp/dicp/dynamo_bridge/op_transformer.py b/dicp/dicp/dynamo_bridge/op_transformer.py index 1b855f6a88..85121b9f36 100644 --- a/dicp/dicp/dynamo_bridge/op_transformer.py +++ b/dicp/dicp/dynamo_bridge/op_transformer.py @@ -55,6 +55,9 @@ def get_proxy(self, target, args: Tuple[Argument, ...], kwargs: Dict[str, Any] = 'call_function', target.get_singleton(), args, kwargs) return proxy + def get_proxy_from_node(self, node): + return self.tracer.proxy(node) + def call_function(self, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any: if target in self._conversions: converted_target = self._conversions[target] diff --git a/dicp/dicp/dynamo_bridge/operator.py b/dicp/dicp/dynamo_bridge/operator.py index 213b8bc31c..99411e9491 100644 --- a/dicp/dicp/dynamo_bridge/operator.py +++ b/dicp/dicp/dynamo_bridge/operator.py @@ -95,7 +95,7 @@ def make_cpu(x): except Exception as e: log = logging.getLogger(__name__) if hasattr(self, "infer_result"): - log.warning( + log.debug( str(self.__name__) + ": infer shape and dtype failed,ignore" ) elif hasattr(self, "torch_op"): diff --git a/dicp/dicp/vendor/AscendGraph/ascend_op.py b/dicp/dicp/vendor/AscendGraph/ascend_op.py index 36a18ad3a7..09c8f302c0 100644 --- a/dicp/dicp/vendor/AscendGraph/ascend_op.py +++ b/dicp/dicp/vendor/AscendGraph/ascend_op.py @@ -24,7 +24,7 @@ def negative_in_shape(shape): class 
Adds(Operator): def __init__(self): - super().__init__("adds") + super().__init__("Adds") def infer_result(self, x1, x2): return common_binary_op_infer(x1, x2) @@ -32,7 +32,7 @@ def infer_result(self, x1, x2): class Add(Operator): def __init__(self): - super().__init__("add") + super().__init__("Add") def infer_result(self, x1, x2): return common_binary_op_infer(x1, x2) @@ -42,11 +42,54 @@ class BroadcastTo(Operator): def __init__(self): super().__init__("BroadcastTo") + def infer_result(self, x, shape): + x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x) + shape, shape_shape, shape_dim, shape_dtype = get_fake_tensor_meta_val(shape) + shape = shape_shape + dims = zip(reversed(shape), reversed(x_shape)) + + for i, t in enumerate(dims): + tar_dim, cur_dim = t + if tar_dim == -1: + shape[-(i + 1)] = cur_dim + continue + elif cur_dim == 1: + continue + assert cur_dim == tar_dim, self.__class__.__name__ + ": shape mismatch!" + # broadcast keep get_memory_format + return torch.empty(shape, dtype=x_dtype, memory_format=get_memory_format(x)) + class Range(Operator): def __init__(self): super().__init__("Range") + def infer_result(self, start, limit=None, delta=None): + start, start_dtype, _ = get_op_const_arg_kwarg(start) + limit, limit_dtype, _ = get_op_const_arg_kwarg(limit) + delta, delta_dtype, _ = get_op_const_arg_kwarg(delta) + + assert start is not None, ( + self.__class__.__name__ + ": input 'start' can't be None!" + ) + if limit is None: + limit = start + start = 0.0 + delta = float(delta) if delta is not None else 1.0 + assert not close2(delta, 0), self.__class__.__name__ + "step must be nonzero" + assert (delta > 0 and limit > start) or (delta < 0 and limit < start), ( + self.__class__.__name__ + + "upper bound and larger bound inconsistent with step sign" + ) + + seq_len = math.ceil((limit - start) / delta) + + return torch.empty( + [seq_len], + dtype=get_cast_dtype(start_dtype, limit_dtype), + memory_format=torch.contiguous_format, + ) + class Cumsum(Operator): def __init__(self): @@ -107,6 +150,12 @@ def infer_result(self, x1, x2): return common_binary_op_infer(x1, x2) +class Muls(Operator): + def __init__(self): + super().__init__("Muls") + self.torch_op = aten.mul + + class Div(Operator): def __init__(self): super().__init__("Div") @@ -236,7 +285,6 @@ def infer_result(self, x, dim=None): + ": can only squeeze a dimension that is 1!" 
) shape.pop(i) - x_memory_format = get_memory_format(x) if len(shape) < 4: x_memory_format = torch.contiguous_format @@ -247,6 +295,15 @@ class Pack(Operator): def __init__(self): super().__init__("Pack") + def infer_result(self, x, dim): + x0, x0_shape, x0_dim, x0_dtype = get_fake_tensor_meta_val(x[0]) + dim = (dim + x0_dim + 1) % (x0_dim + 1) + out_shape = list(x0_shape) + out_shape.insert(dim, len(x)) + return torch.empty( + out_shape, dtype=x0_dtype, memory_format=get_memory_format(x0) + ) + class Permute(Operator): def __init__(self): @@ -257,11 +314,56 @@ class Expand(Operator): def __init__(self): super().__init__("Expand") + # TODO: unfinished, need furthur test + def infer_result(self, x, shape_tensor): + x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x, True) + ( + shape_tensor, + shape_tensor_shape, + shape_tensor_dim, + shape_tensor_dtype, + ) = get_fake_tensor_meta_val(shape_tensor, True) + assert x_dim > 0, self.__class__.__name__ + ": scalar" + shape = list(shape_tensor_shape) + dims = zip(shape, x_shape) + x_stride = list(x.stride()) + for i, t in enumerate(dims): + tar_dim, cur_dim = t + if tar_dim != cur_dim: + x_stride[i] = 0 + if tar_dim == -1: + shape[-(i + 1)] = cur_dim + continue + elif cur_dim == 1: + continue + assert cur_dim == tar_dim, self.__class__.__name__ + ": shape mismatch!" + # broadcast keep get_memory_format + return torch.empty(shape, dtype=x_dtype, memory_format=get_memory_format(x)) + class ExpandD(Operator): def __init__(self): super().__init__("ExpandD") + def infer_result(self, x, shape): + x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x, True) + assert x_dim > 0, self.__class__.__name__ + ": scalar" + dims = zip(shape, x_shape) + x_stride = list(x.stride()) + for i, t in enumerate(dims): + tar_dim, cur_dim = t + if tar_dim != cur_dim: + x_stride[i] = 0 + if tar_dim == -1: + shape[-(i + 1)] = cur_dim + continue + elif cur_dim == 1: + continue + assert cur_dim == tar_dim, self.__class__.__name__ + ": shape mismatch!" 
+ res = torch.empty(shape, dtype=x_dtype, memory_format=get_memory_format(x)) + res = torch.as_strided(res, shape, x_stride, res.storage_offset()) + return res + class Sort(Operator): def __init__(self): @@ -277,10 +379,16 @@ class ScatterElements(Operator): def __init__(self): super().__init__("ScatterElements") + def infer_result(self, var, index, value, dim): + return common_unary_op_infer(var) + -class ReduceMean(Operator): +class ReduceMeanD(Operator): def __init__(self): - super().__init__("ReduceMean") + super().__init__("ReduceMeanD") + + def infer_result(self, x, axes, keepdim=False, noop_with_empty_axes=True): + return reduce_op_infer(x, axes, keepdim) class ReduceStdV2Update(Operator): @@ -300,7 +408,7 @@ class Const(Operator): def __init__(self): super().__init__("Const") - def infer_result(self, new_args, kwargs): + def infer_result(self, *new_args, **kwargs): return new_args, kwargs @@ -318,14 +426,11 @@ def __init__(self): def infer_result(self, base, expo): base, base_shape, base_dim, base_dtype = get_fake_tensor_meta_val(base) - if isinstance(expo, Tuple): # Const - expo, expo_shape = get_op_const_arg_kwarg(expo) + expo, _, expo_shape = get_op_const_arg_kwarg(expo) expo_dtype = type(expo[0]) if len(expo) > 0 else base_dtype else: # fake Tensor - expo, expo_shape, expo_dim, expo_dtype = get_fake_tensor_meta_val( - expo - ) + expo, expo_shape, expo_dim, expo_dtype = get_fake_tensor_meta_val(expo) out_shape = get_broadcast_res_two_shape(base_shape, expo_shape) dtype = get_cast_dtype(base_dtype, expo_dtype) @@ -337,7 +442,7 @@ class Select(Operator): def __init__(self): super().__init__("Select") - def infer_result(self, x1, x2, condition): + def infer_result(self, condition, x1, x2): x1, x1_shape, x1_dim, x1_dtype = get_fake_tensor_meta_val(x1) x2, x2_shape, x2_dim, x2_dtype = get_fake_tensor_meta_val(x2) _, c_shape, _, _ = get_fake_tensor_meta_val(condition) @@ -373,6 +478,14 @@ def infer_result(self, x1, x2): return common_binary_op_infer(x1, x2, torch.bool) +class NotEqual(Operator): + def __init__(self): + super().__init__("NotEqual") + + def infer_result(self, x1, x2): + return common_binary_op_infer(x1, x2, torch.bool) + + class Conv2D(Operator): def __init__(self): super().__init__("Conv2D") @@ -409,9 +522,18 @@ def __init__(self): super().__init__("Identity") def infer_result(self, x, idx=None): - x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x) - out_shape = list(x_shape[idx]) if idx is not None else list(x_shape) - return torch.empty(out_shape, dtype=x_dtype, memory_format=get_memory_format(x)) + x, x_shape, _, x_dtype = get_fake_tensor_meta_val(x) + out_dtype = x_dtype + if x_dtype == torch.complex64: # for complex64 + out_shape = list(x_shape) + if idx == 0 or idx == 1: + out_dtype = torch.float32 + out_shape.append(1) + else: + out_shape = [x_shape[idx]] if idx is not None else list(x_shape) + return torch.empty( + out_shape, dtype=out_dtype, memory_format=get_memory_format(x) + ) class IdentityInp(Operator): @@ -439,6 +561,18 @@ class Empty(Operator): def __init__(self): super().__init__("Empty") + def infer_result( + self, shape, dtype, layout, device, memory_format=torch.contiguous_format + ): + shape, _, _ = get_op_const_arg_kwarg(shape) + return torch.empty( + shape, + dtype=dtype, + layout=layout, + device=device, + memory_format=memory_format, + ) + class GatherV2(Operator): def __init__(self): @@ -452,15 +586,35 @@ def infer_result(self, x, index, axis): return torch.empty(idx_shape, dtype=x_dtype, memory_format=get_memory_format(x)) +class 
GatherElements(Operator): + def __init__(self): + super().__init__("GatherElements") + + def infer_result(self, x, index, axis): + x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x) + idx, idx_shape, idx_dim, idx_dtype = get_fake_tensor_meta_val(index) + return torch.empty(idx_shape, dtype=x_dtype, memory_format=get_memory_format(x)) + + class OnesLike(Operator): def __init__(self): super().__init__("OnesLike") + def infer_result(self, x): + return common_unary_op_infer(x) + class Fill(Operator): def __init__(self): super().__init__("Fill") + def infer_result(self, dims, value): + _, value_dtype, _ = get_op_const_arg_kwarg(value) + shape, _, _ = get_op_const_arg_kwarg(dims) + return torch.empty( + shape, dtype=value_dtype, memory_format=torch.contiguous_format + ) + class Conv2DBackpropInput(Operator): def __init__(self): @@ -542,11 +696,34 @@ class SplitD(Operator): def __init__(self): super().__init__("SplitD") + def infer_result(self, x, split_dim, num_split, y, from_view_complex=False): + assert from_view_complex == True, ( + self.__class__.__name__ + + ": currently available only in op view_as_complex!" + ) + x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x) + split_dim = (split_dim + x_dim) % x_dim + out_shape = list(x_shape) + del out_shape[-1] + return torch.empty( + out_shape, + dtype=torch.complex64 if from_view_complex else x_dtype, + memory_format=get_memory_format(x), + ) + class Slice(Operator): def __init__(self): super().__init__("Slice") + def infer_result(self, x, offset, size): + x, x_shape, _, x_dtype = get_fake_tensor_meta_val(x) + new_shape, _, _ = get_op_const_arg_kwarg(size) + offset, _, _ = get_op_const_arg_kwarg(offset) + _, storage_offset = cal_stride_offset(new_shape, offset, x) + res = torch.as_strided(x, new_shape, x.stride(), storage_offset) + return res + class ConcatD(Operator): def __init__(self): @@ -570,33 +747,32 @@ class MaskedFill(Operator): def __init__(self): super().__init__("MaskedFill") + def infer_result(self, x, mask, value): + x, x_shape, _, x_dtype = get_fake_tensor_meta_val(x) + _, _, _, value_dtype = get_fake_tensor_meta_val(value) + _, mask_shape, _, _ = get_fake_tensor_meta_val(mask) + return torch.empty( + get_broadcast_res_two_shape(x_shape, mask_shape), + dtype=get_cast_dtype(x_dtype, value_dtype), + memory_format=get_memory_format(x), + ) + class Reshape(Operator): def __init__(self): super().__init__("Reshape") - # TODO:conflict in solving stride between "view" and "select" - def infer_result(self, x, shape_const_op): - x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x) - re_shape, re_dim = get_op_const_arg_kwarg(shape_const_op) - # check whether stride and storage_offset are manually specified - # if so, x is from operators like "Slice", and the stride and storage_offset still need to modify here + def infer_result(self, x, shape_const_op, ori_op=None, params_passed=None): + x, _, _, x_dtype = get_fake_tensor_meta_val(x) + re_shape, _, _ = get_op_const_arg_kwarg(shape_const_op) x_stride = list(x.stride()) - x_shape = list(x_shape) - - for i in range(len(x_stride) - 2, -1, -1): - if x_stride[i + 1] * x_shape[i + 1] != x_stride[i]: - del x_stride[i + 1] - del x_shape[i + 1] - break - else: - if len(x_shape) != len(re_shape): - del x_stride[0] - del x_shape[0] - - x_storage_offset = x.storage_offset() res = torch.empty(re_shape, dtype=x_dtype, memory_format=get_memory_format(x)) - res = torch.as_strided(res, re_shape, x_stride, x_storage_offset) + if ori_op == "Select": + assert "sel_dim" in params_passed, ( + 
self.__class__.__name__ + ':param "sel_dim" from Select missing!' + ) + del x_stride[params_passed["sel_dim"]] + res = torch.as_strided(res, re_shape, x_stride, x.storage_offset()) return res @@ -633,7 +809,6 @@ def __init__(self): super().__init__("Shape") def infer_result(self, x): - # like Const, we won't use this function, but it should exist as a flag for triggering inference of resinfo return common_unary_op_infer(x, spec_format=torch.contiguous_format) @@ -675,6 +850,16 @@ def __init__(self): super().__init__("DropOutDoMaskV3") +class MaxPool(Operator): + def __init__(self): + super().__init__("MaxPool") + + +class PadV3(Operator): + def __init__(self): + super().__init__("PadV3") + + def ret_triple(a, b, c) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: return a, b, c diff --git a/dicp/dicp/vendor/AscendGraph/codegen/ascend.py b/dicp/dicp/vendor/AscendGraph/codegen/ascend.py index 635fb5161b..c32b8e7974 100644 --- a/dicp/dicp/vendor/AscendGraph/codegen/ascend.py +++ b/dicp/dicp/vendor/AscendGraph/codegen/ascend.py @@ -334,34 +334,27 @@ def gen_call_func(self): call_body.writeline(shape_str) else: call_body.writeline('''output_shape = None''') - - + # add stride & storage_offset info - out_stride_str = '''out_stride = [''' - out_storage_offset_str = '''out_storage_offset = [''' + out_strides = [] + out_storage_offsets = [] for elem in self.output_args: if hasattr(elem, 'meta'): elem = elem.meta['val'] if isinstance(elem, torch.SymInt) or isinstance(elem, torch.SymBool): - out_stride_str += '[1],' - out_storage_offset_str += '0,' + out_strides.append('[1]') + out_storage_offsets.append('0') continue if elem.dim()==0: # temporary solution for sum.default(a) whose result is a scalar(no dim no stride) - out_stride_str += '[1],' - out_storage_offset_str += '0,' + out_strides.append('[1]') + out_storage_offsets.append('0') continue stride = list(elem.stride()) - if len(stride) == 0: - raise RuntimeError("Error handling empty output_stride") stride = [self.process_sym_name(str(dim)) for dim in stride] - out_stride_str += '[' + ','.join(map(str, stride)) + '],' - out_storage_offset_str += str(elem.storage_offset()) + ',' - out_stride_str += extra_stride_str - out_storage_offset_str += extra_storage_offset_str - out_stride_str = out_stride_str[:-1] + ']' - out_storage_offset_str = out_storage_offset_str[:-1] + ']' - call_body.writeline(out_stride_str) - call_body.writeline(out_storage_offset_str) + out_strides.append(str(stride)) + out_storage_offsets.append(elem.storage_offset()) + call_body.writeline(f'out_stride = {out_strides}') + call_body.writeline(f'out_storage_offset = {out_storage_offsets}') call_body.splice(""" import torch_dipu @@ -369,14 +362,17 @@ def gen_call_func(self): for idx in range(len(args)): if isinstance(args[idx], int): args[idx] = torch.tensor(args[idx], device=dipu_device_str, dtype=torch.int32) - if isinstance(args[idx], torch.Tensor): - tmp_arg = args[idx].clone() - with torch.no_grad(): - args[idx].copy_(tmp_arg) - del tmp_arg """, strip=True) call_body.writeline(f"({','.join(self.args)}) = args") - call_str = ['output_tensor = kernel_cpp_0(args, dims, output_shape, out_stride, out_storage_offset)'] + + # dealing with modified args passing back + allocated_output = {} + for item in self.assign_args: + input_index = item[1] + output_index = self.graph_output_names.index(item[0]) + allocated_output[output_index] = input_index + call_body.writeline(f'allocated_output= {allocated_output}') + call_str = ['output_tensor = kernel_cpp_0(args, dims, output_shape, 
out_stride, out_storage_offset, allocated_output)'] if precision_check and self.aten_graph is not None: # import aten graph @@ -401,10 +397,6 @@ def gen_call_func(self): call_str.extend([f'del {name}', f'{name} = int(output_tensor[{i}])']) - # dealing with modified args passing back - output_convert = [f'args[{name[1]}].copy_({name[0]})' for name in self.assign_args] - call_str.extend(output_convert) - if precision_check: for i, name in enumerate(self.py_output_names): if name != 'None' and name not in self.args and name not in self.symint_outputs: @@ -712,6 +704,13 @@ def Mul(name, x, y): op.set_input("x2", y) return op.to_node() + @staticmethod + def Muls(name, x, y): + op = OP(name, "Muls") + op.set_input("x", x) + op.set_attr_float("value", float(y)) + return op.to_node() + @staticmethod def IdentityN(name, *args, **kwargs): input_names = [] @@ -730,14 +729,14 @@ def IdentityN(name, *args, **kwargs): return id_op.to_node() @staticmethod - def adds(name, x, y): + def Adds(name, x, y): adds_op = OP(name, "Adds") adds_op.set_input("x", x) adds_op.set_attr_float("value", float(y)) return adds_op.to_node() @staticmethod - def add(name, x, y): + def Add(name, x, y): add_op = OP(name, "Add") add_op.set_input("x1", x) add_op.set_input("x2", y) @@ -770,12 +769,6 @@ def Transpose(name, input, perm): transpose_op.set_input("perm", perm) return transpose_op.to_node() - @staticmethod - def reciprocal(name, x): - op = OP(name, "Reciprocal") - op.set_input("x", x) - return op.to_node() - @staticmethod def Sqrt(name, x): op = OP(name, "Sqrt") @@ -826,11 +819,12 @@ def Conv2D(name, input, weight, stride, padding, return op.to_node() @staticmethod - def ReduceMean(name, x, axes, keepdim=False): - mean_op = OP(name, "ReduceMean") + def ReduceMeanD(name, x, axes, keepdim=False, noop_with_empty_axes=False): + mean_op = OP(name, "ReduceMeanD") mean_op.set_input("x", x) - mean_op.set_input("axes", axes) + mean_op.set_attr_list_int("axes", axes) mean_op.set_attr_bool("keep_dims", keepdim) + mean_op.set_attr_bool("noop_with_empty_axes", noop_with_empty_axes) return mean_op.to_node() @staticmethod @@ -1043,11 +1037,12 @@ def BroadcastTo(name, x, shape): return broadcast_op.to_node() @staticmethod - def Empty(name, shape, dtype, layout=torch.strided, device='cpu'): + def Empty(name, shape, dtype, layout=torch.strided, device='cpu', memory_format=torch.contiguous_format): dtype = get_ascend_dtype_num(get_ascend_dtype(dtype)) op = OP(name, "Empty") op.set_input("shape", shape) op.set_attr_int("dtype", dtype) + op.set_attr_bool("init", False) return op.to_node() @staticmethod @@ -1196,7 +1191,7 @@ def ret_triple(name, in1, in2, in3): return op.to_node() @staticmethod - def Range(name, end, start, step): + def Range(name, start, end, step): op = OP(name, "Range") op.set_input("start", start) op.set_input("limit", end) @@ -1210,6 +1205,13 @@ def Equal(name, a, b): eq_op.set_input("x2", b) return eq_op.to_node() + @staticmethod + def NotEqual(name, a, b): + eq_op = OP(name, "NotEqual") + eq_op.set_input("x1", a) + eq_op.set_input("x2", b) + return eq_op.to_node() + @staticmethod def Cumsum(name, x, dim): op = OP(name, "Cumsum") @@ -1328,7 +1330,7 @@ def ThresholdGradV2D(name, grad_output, x, threshold): return op.to_node() @staticmethod - def SplitD(name, x, dim, num_split, y): + def SplitD(name, x, dim, num_split, y, from_view_complex): split_op = OP(name, "SplitD") split_op.set_input("x", x) split_op.set_attr_int("split_dim", dim) @@ -1365,7 +1367,7 @@ def ConcatD(name, x, dim): return op.to_node() @staticmethod - 
def Reshape(name, x, shape): + def Reshape(name, x, shape, ori_op=None, params_passed=None): op = OP(name, "Reshape") op.set_input("x", x) op.set_input("shape", shape) @@ -1464,3 +1466,11 @@ def DropOutDoMaskV3(name, x, mask, keep_prob): op.set_input("mask", mask) op.set_input("keep_prob", keep_prob) return op.to_node() + + @staticmethod + def GatherElements(name, x, index, dim): + op = OP(name, "GatherElements") + op.set_input("x", x) + op.set_input("index", index) + op.set_attr_int("dim", dim) + return op.to_node() diff --git a/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg b/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg new file mode 100644 index 0000000000..71834659c8 --- /dev/null +++ b/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg @@ -0,0 +1,10 @@ +{ + "Switch":{ + "GraphFusion":{ + "ALL":"on" + }, + "UBFusion":{ + "ALL":"on" + } + } +} diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp b/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp index fbced63f60..99f422dcaa 100644 --- a/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp +++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp @@ -1,7 +1,8 @@ #include "graph_utils.h" static void compile(const std::string& graph_path, - const std::string& graph_json_file) { + const std::string& graph_json_file, + const std::string& fusion_switch_file) { std::string graph_name = "BuildGraph"; Graph graph(graph_name.c_str()); std::ifstream f(graph_json_file); @@ -18,13 +19,14 @@ static void compile(const std::string& graph_path, } } - AclgraphBuilder builder; + AclgraphBuilder builder{fusion_switch_file}; builder.saveGraph(graph_path, graph, options); } int main(int argc, char* argv[]) { std::string graph_path{argv[1]}; std::string graph_json_file{argv[2]}; - compile(graph_path, graph_json_file); + std::string fusion_switch_file{argv[3]}; + compile(graph_path, graph_json_file, fusion_switch_file); return 0; } diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h index 380670146f..2cbacf3bcb 100644 --- a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h +++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h @@ -12,6 +12,7 @@ #include #include +#include "acl/acl.h" #include "all_ops.h" #include "ascend_string.h" #include "ge_api.h" @@ -81,12 +82,14 @@ ge::Operator genInput(const std::string op_name, class AclgraphBuilder { public: - explicit AclgraphBuilder() { + explicit AclgraphBuilder(const std::string& fusion_switch_file) + : _fusion_switch_file(fusion_switch_file) { // 1. system init - std::string kSocVersion = "Ascend910ProB"; + auto kSocVersion = aclrtGetSocName(); std::map global_options = { - {AscendString(ge::ir_option::SOC_VERSION), - AscendString(kSocVersion.c_str())}, + {AscendString(ge::ir_option::SOC_VERSION), AscendString(kSocVersion)}, + {AscendString(ge::ir_option::FUSION_SWITCH_FILE), + AscendString(_fusion_switch_file.c_str())}, {AscendString(ge::ir_option::PRECISION_MODE), "allow_fp32_to_fp16"}, }; auto status = aclgrphBuildInitialize(global_options); @@ -122,6 +125,9 @@ class AclgraphBuilder { aclgrphBuildFinalize(); std::cout << "aclgrphBuildFinalize success!" 
<< std::endl; } + + private: + std::string _fusion_switch_file; }; ge::Format get_ascend_format(const std::string& format) { diff --git a/dicp/dicp/vendor/AscendGraph/codegen/load_and_run.py b/dicp/dicp/vendor/AscendGraph/codegen/load_and_run.py index b57516e386..5b0a5ea2c0 100644 --- a/dicp/dicp/vendor/AscendGraph/codegen/load_and_run.py +++ b/dicp/dicp/vendor/AscendGraph/codegen/load_and_run.py @@ -1,10 +1,11 @@ -import acl +import atexit import os + +import acl import numpy as np import torch -import atexit import torch_dipu - +from torch.profiler import record_function dipu_device_str = torch_dipu.dipu.device.__diputype__ @@ -53,6 +54,16 @@ ACL_MDL_OUTPUTQ_ADDR_PTR = 12 ACL_MDL_WORKSPACE_MEM_OPTIMIZE = 13 +ACL_DDR_MEM = 0 +ACL_HBM_MEM = 1 +ACL_DDR_MEM_HUGE = 2 +ACL_DDR_MEM_NORMAL = 3 +ACL_HBM_MEM_HUGE = 4 +ACL_HBM_MEM_NORMAL = 5 +ACL_DDR_MEM_P2P_HUGE = 6 +ACL_DDR_MEM_P2P_NORMAL = 7 +ACL_HBM_MEM_P2P_HUGE = 8 +ACL_HBM_MEM_P2P_NORMAL = 9 def get_np_dtype(dtype): if dtype == ACL_FLOAT: @@ -110,7 +121,7 @@ def __init__(self): def init_work_weight_ptr(self): if self.work_ptr is None: - self.work_size = 15 * 1024 * 1024 * 1024 + self.work_size = 18 * 1024 * 1024 * 1024 self.work_ptr, ret = acl.rt.malloc(self.work_size, ACL_MEM_MALLOC_HUGE_FIRST) check_ret("acl.rt.malloc", ret) @@ -124,6 +135,7 @@ def release_memory(self): memory_pool = MemoryPool() +zero_tensor = torch.randn(1).to(dipu_device_str) class AscendExecutor(object): @@ -172,12 +184,17 @@ def load_model(self): if work_size == 0: work_size = memory_pool.work_size elif work_size > memory_pool.work_size: - memory_pool.work_size = work_size - memory_pool.release_memory() - print("Adjust memory pool allocation.") - memory_pool.work_ptr, ret = acl.rt.malloc(work_size, - ACL_MEM_MALLOC_HUGE_FIRST) - check_ret("acl.rt.malloc", ret) + free, _, ret = acl.rt.get_mem_info(ACL_HBM_MEM) + check_ret("acl.rt.get_mem_info", ret) + # If free < work_size, means that memory is insufficient. + # Just ignore and continue, it may be work. 
+ if free > work_size: + memory_pool.work_size = work_size + memory_pool.release_memory() + print("Adjust memory pool allocation.") + memory_pool.work_ptr, ret = acl.rt.malloc(work_size, + ACL_MEM_MALLOC_HUGE_FIRST) + check_ret("acl.rt.malloc", ret) self.weight_ptr, ret = acl.rt.malloc(weight_size, ACL_MEM_MALLOC_HUGE_FIRST) @@ -203,7 +220,7 @@ def load_model(self): check_ret("set_config_opt", ret) ret = acl.mdl.set_config_opt( - config_handle, ACL_MDL_WORKSPACE_SIZET, work_size) + config_handle, ACL_MDL_WORKSPACE_SIZET, memory_pool.work_size) check_ret("set_config_opt", ret) ret = acl.mdl.set_config_opt( @@ -252,9 +269,9 @@ def init_resource(self): print("init resource success") + @record_function('load_and_run_prepare_input') def _prepare_input(self, images, dims): assert self.num_inputs == len(images) - zero_tensor = torch.randn(1).to(dipu_device_str) for i in range(self.num_inputs): buffer_size = self.input_size[i] if dims is not None and i in dims.keys(): @@ -283,10 +300,14 @@ def _prepare_input(self, images, dims): check_ret("acl.mdl.set_dataset_tensor_desc", ret) assert (dataset == self.input_dataset) - def _prepare_output(self, output_tensor, output_shape, out_stride, out_storage_offset): + @record_function('load_and_run_prepare_output') + def _prepare_output(self, output_tensor, output_shape, out_stride, out_storage_offset, allocated_output): for i in range(self.num_outputs): - item = torch.empty( - self.output_dims[i], dtype=self.output_dtypes[i], device=dipu_device_str) + if allocated_output and i in allocated_output.keys(): + item = allocated_output[i] + else: + item = torch.empty( + self.output_dims[i], dtype=self.output_dtypes[i], device=dipu_device_str) # TODO! add case judgement for stride info # item = item.as_strided( # self.output_dims[i], out_stride[i], out_storage_offset[i]) @@ -295,7 +316,8 @@ def _prepare_output(self, output_tensor, output_shape, out_stride, out_storage_o self.output_data_buffers[i], item.data_ptr(), self.output_size[i]) check_ret("acl.update_data_buffer", ret) - def _prepare_dynamic_output(self, output_tensor, output_shape, out_stride, out_storage_offset): + @record_function('load_and_run_prepare_dynamic_output') + def _prepare_dynamic_output(self, output_tensor, output_shape, out_stride, out_storage_offset, allocated_output): for i in range(self.num_outputs): tot_size = 1 for elem in output_shape[i]: @@ -304,8 +326,11 @@ def _prepare_dynamic_output(self, output_tensor, output_shape, out_stride, out_s tot_size *= acl.data_type_size(dtype) self.output_dims[i] = output_shape[i] self.output_size[i] = tot_size - item = torch.empty( - self.output_dims[i], dtype=self.output_dtypes[i], device=dipu_device_str) + if allocated_output and i in allocated_output.keys(): + item = allocated_output[i] + else: + item = torch.empty( + self.output_dims[i], dtype=self.output_dtypes[i], device=dipu_device_str) # TODO! 
add case judgement for stride info # item = item.as_strided( # self.output_dims[i], out_stride[i], out_storage_offset[i]) @@ -315,20 +340,31 @@ def _prepare_dynamic_output(self, output_tensor, output_shape, out_stride, out_s self.output_data_buffers[i], item.data_ptr(), self.output_size[i]) check_ret("acl.update_data_buffer", ret) - def run(self, images, dims=None, output_shape=None, out_stride=None, out_storage_offset=None): + @record_function('load_and_run_run') + def run(self, images, dims=None, output_shape=None, + out_stride=None, out_storage_offset=None, + allocated_output=None): assert len(images) > 0 input = [x.to(dipu_device_str) if isinstance(x, torch.Tensor) and x.device.type != dipu_device_str else x for x in images] + allocated_output_tensor = None + if allocated_output: + allocated_output_tensor = {} + for output_index, input_index in allocated_output.items(): + allocated_output_tensor[output_index] = input[input_index] self._prepare_input(input, dims) output = [] if output_shape: - self._prepare_dynamic_output(output, output_shape, out_stride, out_storage_offset) + self._prepare_dynamic_output( + output, output_shape, out_stride, out_storage_offset, allocated_output_tensor) else: - self._prepare_output(output, output_shape, out_stride, out_storage_offset) + self._prepare_output( + output, output_shape, out_stride, out_storage_offset, allocated_output_tensor) self.forward() self._destroy_databuffer() return output + @record_function('load_and_run_forward') def forward(self): ret = acl.mdl.execute(self.model_id, self.input_dataset, @@ -348,8 +384,8 @@ def __init__(self, device_id, model_path) -> None: self.exe = AscendExecutor(device_id, model_path) def run(self, images, dims=None, output_shape=None, - out_stride=None, out_storage_offset=None): - return self.exe.run(images, dims, output_shape, out_stride, out_storage_offset) + out_stride=None, out_storage_offset=None, allocated_output=None): + return self.exe.run(images, dims, output_shape, out_stride, out_storage_offset, allocated_output) def cleanup(self): if hasattr(self, 'exe'): diff --git a/dicp/dicp/vendor/AscendGraph/compile_job.py b/dicp/dicp/vendor/AscendGraph/compile_job.py index 6b3b2b8228..93b70dca43 100644 --- a/dicp/dicp/vendor/AscendGraph/compile_job.py +++ b/dicp/dicp/vendor/AscendGraph/compile_job.py @@ -28,12 +28,14 @@ def __init__(self, source_code) -> None: graph_util_path = load_and_run.__file__.replace('/load_and_run.py', '') source_path = graph_util_path + '/graph_compile.cpp' json_util_path = graph_util_path + '/nlohmann' + self.fusion_switch_file = graph_util_path + '/fusion_switch.cfg' self._cmd = ['/usr/bin/c++', '-D_GLIBCXX_USE_CXX11_ABI=0', '-fPIC', '-std=c++11', '-O3', '-Wall', + '-I/usr/local/Ascend/ascend-toolkit/latest/include', '-I/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_proto/inc', '-I/usr/local/Ascend/ascend-toolkit/latest/include/graph', '-I/usr/local/Ascend/ascend-toolkit/latest/include/ge', @@ -46,10 +48,10 @@ def __init__(self, source_code) -> None: '-lge_runner', source_path, '-o' + self._lib_path, - '-Wl,-rpath,/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub', '/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub/libgraph.so', '/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub/libge_runner.so', - '/usr/local/Ascend/ascend-toolkit/latest/lib64/libgraph_base.so'] + '/usr/local/Ascend/ascend-toolkit/latest/lib64/libgraph_base.so', + '/usr/local/Ascend/ascend-toolkit/latest/runtime/lib64/stub/libascendcl.so',] def _compile(self): if not 
os.path.exists(self._lib_path): @@ -66,7 +68,7 @@ def get_key(self): def build_graph(self, output_path, graph_path): self._compile() - cmd = [self._lib_path, output_path, graph_path] + cmd = [self._lib_path, output_path, graph_path, self.fusion_switch_file] try: subprocess.check_output(cmd, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: diff --git a/dicp/dicp/vendor/AscendGraph/conversion.py b/dicp/dicp/vendor/AscendGraph/conversion.py index d4f850ced7..bc43ac02dd 100644 --- a/dicp/dicp/vendor/AscendGraph/conversion.py +++ b/dicp/dicp/vendor/AscendGraph/conversion.py @@ -129,18 +129,16 @@ def get_param_proxy(self, param, type, target_shape): param = param if isinstance(param, list) else [param] param = self.get_proxy( ascend_op.Const, (param, type, [len(param)])) - shape_op = self.get_shape_proxy(target_shape) - param = self.get_proxy(ascend_op.BroadcastTo, (param, shape_op)) return param def mul_scalar(self, x, y): out_dtype = fx_traceback.get_current_meta()['val'].dtype - const_dtype = torch.float32 if out_dtype == torch.float16 else out_dtype - y_shape = list(x.node.meta['val'].shape) - y_op = self.get_param_proxy(y, const_dtype, y_shape) - if out_dtype == torch.float16: - y_op = self.get_proxy(ascend_op.Cast, (y_op, "FLOAT16")) - return self.get_proxy(ascend_op.Mul, (x, y_op)) + # Muls support bfloat16, int32, int16, float16, float32, complex32, complex64. + if out_dtype not in [torch.float, torch.float16, torch.int32]: + y_shape = list(x.node.meta['val'].shape) + y_op = self.get_param_proxy(y, out_dtype, y_shape) + return self.get_proxy(ascend_op.Mul, (x, y_op)) + return self.get_proxy(ascend_op.Muls, (x, y)) def mul_complex64(self, x, y): out_dtype = fx_traceback.get_current_meta()['val'].dtype @@ -162,6 +160,16 @@ def mul_complex64(self, x, y): out = self.get_proxy(ascend_op.IdentityN, (ac_bd, ad_bc)) return out + def binary_cmp_cast_input(self, x, y): + if not isinstance(y, torch.fx.proxy.Proxy): + x_dtype = x.node.meta["val"].dtype + const_dtype = torch.float32 if x_dtype == torch.float16 else x_dtype + y_shape = list(x.node.meta["val"].shape) + y = self.get_param_proxy(y, const_dtype, y_shape) + if x_dtype == torch.float16: + y = self.get_proxy(ascend_op.Cast, (y, "FLOAT16")) + return x, y + @register_conversion(torch.ops.aten.mul) def mul(self, x, y): out_dtype = fx_traceback.get_current_meta()['val'].dtype @@ -223,12 +231,8 @@ def _to_copy(self, x, dtype=None, layout=torch.strided, device=None): @register_conversion(aten.le) def le(self, a, b): - if isinstance(b, torch.fx.proxy.Proxy): - return self.get_proxy(ascend_op.LessEqual, (a, b), {}) - x2 = self.get_proxy(ascend_op.Const, ([b], torch.float32, [])) - if a.node.meta['val'].dtype == torch.float16: - x2 = self.get_proxy(ascend_op.Cast, (x2, "FLOAT16"), {}) - return self.get_proxy(ascend_op.LessEqual, (a, x2), {}) + a, b = self.binary_cmp_cast_input(a, b) + return self.get_proxy(ascend_op.LessEqual, (a, b), {}) @register_conversion(aten.view_as_real) def view_as_real(self, x): @@ -283,10 +287,10 @@ def slice(self, x, dim=0, start=None, end=None, step=1): x_shape = list(x.node.meta['val'].shape) y_shape = list(fx_traceback.get_current_meta()['val'].shape) dim = int(dim) - start = int(start) + start = int(start) if start is not None else 0 start = start if start >= 0 else x_shape[dim] + start - assert dim >= 0 and dim < len(x_shape) - assert start >= 0 and start < x_shape[dim] + assert dim == -1 or dim >= 0 and dim < len(x_shape) + assert start is None or start >= 0 and start < x_shape[dim] offset = [0] * 
len(x_shape) offset[dim] = start offset = self.get_shape_proxy(offset) @@ -310,10 +314,10 @@ def NewEmptyStrided(self, x, size, stride, dtype=torch.float32, layout=torch.str return self.empty_like(x) @register_conversion(aten.empty) - def empty(self, size, dtype=torch.int64, layout=torch.strided, device='cpu'): + def empty(self, size, dtype=torch.int64, layout=torch.strided, device='cpu', memory_format=torch.contiguous_format): shape_op = self.get_proxy( ascend_op.Const, (size, torch.int32, [len(size)])) - return self.get_proxy(ascend_op.Empty, (shape_op, dtype, layout, device)) + return self.get_proxy(ascend_op.Empty, (shape_op, dtype, layout, device, memory_format)) @register_conversion(aten.empty_like.default) def empty_like(self, x, dtype=torch.float32, layout=torch.strided, @@ -322,7 +326,8 @@ def empty_like(self, x, dtype=torch.float32, layout=torch.strided, shape = list(x.node.meta['val'].shape) shape_op = self.get_proxy( ascend_op.Const, (shape, torch.int32, [len(shape)])) - return self.get_proxy(ascend_op.Empty, (shape_op, dtype, layout, device)) + new_memory_format=x.node.meta['tensor_meta'].memory_format if memory_format is torch.preserve_format else memory_format + return self.get_proxy(ascend_op.Empty, (shape_op, dtype, layout, device, new_memory_format)) @register_conversion(aten.select.int) def select(self, x, dim, index): @@ -345,7 +350,13 @@ def select(self, x, dim, index): size = self.get_shape_proxy(size) slice = self.get_proxy(ascend_op.Slice, (x, offset, size)) y_shape = self.get_shape_proxy(y_shape) - return self.get_proxy(ascend_op.Reshape, (slice, y_shape)) + Reshape_kw = { + "ori_op": "Select", + "params_passed": { + "sel_dim": dim, + }, + } + return self.get_proxy(ascend_op.Reshape, (slice, y_shape), Reshape_kw) @register_conversion(_operator.add) def inadd(self, x, y): @@ -400,7 +411,7 @@ def view(self, x, size): return self.get_proxy(ascend_op.IdentityN, (real_reshape, imag_reshape)) else: return self.get_proxy(ascend_op.Reshape, (x, shape)) - + @register_conversion(torch.ops.aten.where) def where(self, condition, x1, x2): # TODO(tangzhiyi): need to process scalars @@ -430,7 +441,7 @@ def arange(self, end, start=0, step=1, dtype=None, device='xpu', layout=None, pi step = self.get_proxy(ascend_op.Const, (step, out_dtype)) elif step.node.meta['val'] != out_dtype: step = self.get_proxy(ascend_op.Cast, (step, get_ascend_dtype(out_dtype)), {}) - return self.get_proxy(ascend_op.Range, (end, start, step)) + return self.get_proxy(ascend_op.Range, (start, end, step)) @register_conversion(aten.arange.start) def arange_start(self, start, end, step=1, dtype=None, device=None, layout=None, pin_memory=False): @@ -438,21 +449,17 @@ def arange_start(self, start, end, step=1, dtype=None, device=None, layout=None, @register_conversion([aten.eq, aten.eq.Tensor]) def eq(self, a, b): - if not isinstance(b, torch.fx.proxy.Proxy): - assert isinstance(b, int) - b_shape = list(a.node.meta['val'].shape) - b = self.get_param_proxy(b, torch.int64, b_shape) + a, b = self.binary_cmp_cast_input(a, b) return self.get_proxy(ascend_op.Equal, (a, b)) + @register_conversion(aten.ne.Scalar) + def ne(self, a, b): + a, b = self.binary_cmp_cast_input(a, b) + return self.get_proxy(ascend_op.NotEqual, (a, b)) + @register_conversion([aten.lt.Scalar, aten.lt.Tensor]) def lt(self, x, y): - if not isinstance(y, torch.fx.proxy.Proxy): - x_dtype = x.node.meta['val'].dtype - const_dtype = torch.float32 if x_dtype == torch.float16 else x_dtype - y_shape = list(x.node.meta['val'].shape) - y = 
self.get_param_proxy(y, const_dtype, y_shape) - if x_dtype == torch.float16: - y = self.get_proxy(ascend_op.Cast, (y, "FLOAT16")) + x, y = self.binary_cmp_cast_input(x, y) return self.get_proxy(ascend_op.Less, (x, y)) @register_conversion(aten.masked_fill.Scalar) @@ -467,7 +474,7 @@ def masked_fill(self, x, mask, value): value = self.get_proxy(ascend_op.Cast, (value, "FLOAT16")) return self.get_proxy(ascend_op.MaskedFill, (x, mask, value)) - @register_conversion(torch.ops.aten.scatter.src) + @register_conversion([torch.ops.aten.scatter.src, torch.ops.aten.scatter.value]) def scatter(self, var, dim, index, value): assert isinstance(dim, int) index_shape = list(index.node.meta['val'].shape) @@ -531,7 +538,8 @@ def view_as_complex(self, x): assert x_val.dtype == torch.float32 assert x_shape[-1] == 2 dim = len(x_shape) - 1 - return self.get_proxy(ascend_op.SplitD, (x, dim, 2, 2)) + splitD_kw = { "from_view_complex": True } + return self.get_proxy(ascend_op.SplitD, (x, dim, 2, 2), splitD_kw) @register_conversion(torch.ops.aten.full.default) def full(self, dims, value, dtype=torch.float32, layout=torch.strided, @@ -562,10 +570,10 @@ def sort(self, x, dim=-1, descending=False): return self.get_proxy(ascend_op.Sort, (x, dim, descending)) @register_conversion(torch.ops.aten.ones.default) - def ones(self, shape, dtype=torch.int64, device='cpu', pin_memory=False): + def ones(self, shape, dtype=torch.float32, layout=torch.strided, device='cpu', pin_memory=False): shape = self.get_proxy( ascend_op.Const, (shape, torch.int32, [len(shape)])) - like = self.get_proxy(ascend_op.Empty, (shape, dtype)) + like = self.get_proxy(ascend_op.Empty, (shape, dtype, layout, device)) return self.get_proxy(ascend_op.OnesLike, (like,)) @register_conversion(torch.ops.aten.new_ones.default) @@ -781,16 +789,12 @@ def maximum(self, a, b): b = self.get_proxy(ascend_op.Cast, (b, "FLOAT16")) return self.get_proxy(ascend_op.Maximum, (a, b)) - def common_process_scalar(self, x, y): - x_dtype = x.node.meta['val'].dtype + def common_process_scalar(self, y, dtype): need_cast = False - if x_dtype == torch.float16: - x_dtype = torch.float32 + if dtype == torch.float16: + dtype = torch.float32 need_cast = True - y = self.get_proxy(ascend_op.Const, (y, x_dtype)) - y_shape = list(x.node.meta['val'].shape) - shape_preprocess = self.get_shape_proxy(y_shape) - y = self.get_proxy(ascend_op.BroadcastTo, (y, shape_preprocess)) + y = self.get_proxy(ascend_op.Const, (y, dtype)) if need_cast: y = self.get_proxy(ascend_op.Cast, (y, "FLOAT16")) return y @@ -798,13 +802,13 @@ def common_process_scalar(self, x, y): @register_conversion(aten.sub) def sub(self, x, y): if not isinstance(y, torch.fx.proxy.Proxy): - y = self.common_process_scalar(x, y) + y = self.common_process_scalar(y, x.node.meta['val'].dtype) return self.get_proxy(ascend_op.Sub, (x, y)) @register_conversion(aten.rsub) def rsub(self, x, y): if not isinstance(y, torch.fx.proxy.Proxy): - y = self.common_process_scalar(x, y) + y = self.common_process_scalar(y, x.node.meta['val'].dtype) return self.get_proxy(ascend_op.Sub, (y, x)) @register_conversion(aten.transpose.int) @@ -855,15 +859,22 @@ def symsize(self, x, dim): def mm(self, x, y): # TODO! 
MatMul not support fp32 input # for higher precision in some cases - out_dtype = fx_traceback.get_current_meta()['val'].dtype if len(self.sym_in_args) > 0 or len(self.sym_to_inputs) > 0: x = self.get_proxy(ascend_op.Unsqueeze, (x, [0])) y = self.get_proxy(ascend_op.Unsqueeze, (y, [0])) mm = self.get_proxy(ascend_op.BatchMatMul, (x, y, False, False)) return self.get_proxy(ascend_op.Squeeze, (mm, [0])) - else: - mm = self.get_proxy(ascend_op.MatMul, (x, y, False, False)) - return self.get_proxy(ascend_op.Cast, (mm, get_ascend_dtype(out_dtype))) + out_dtype = fx_traceback.get_current_meta()['val'].dtype + trans_x = False + trans_y = False + if isinstance(x.node.target, ascend_op.Permute) and x.node.args[1] == [1, 0]: + x = self.get_proxy_from_node(x.node.args[0]) + trans_x = True + if isinstance(y.node.target, ascend_op.Permute) and y.node.args[1] == [1, 0]: + y = self.get_proxy_from_node(y.node.args[0]) + trans_y = True + mm = self.get_proxy(ascend_op.MatMul, (x, y, trans_x, trans_y)) + return self.get_proxy(ascend_op.Cast, (mm, get_ascend_dtype(out_dtype))) @register_conversion(aten.bmm.default) def bmm(self, x, y): @@ -884,9 +895,9 @@ def addmm(self, c, a, b, beta=1.0, alpha=1.0): @register_conversion(torch.ops.aten.mean) def mean(self, x, dims=[], keepdim=False): - axes = self.get_proxy( - ascend_op.Const, (dims, torch.int32, [] if len(dims) == 0 else [len(dims)])) - return self.get_proxy(ascend_op.ReduceMean, (x, axes, keepdim)) + if not isinstance(dims, list): + dims = [dims] + return self.get_proxy(ascend_op.ReduceMeanD, (x, dims, keepdim, False)) @register_conversion(torch.ops.aten.cumsum.default) def cumsum(self, x, dim, dtype=None): @@ -954,9 +965,7 @@ def embedding(self, weight, indices, padding_idx=-1): @register_conversion(torch.ops.aten.gather) def gather(self, x, dim, index): - dim = [dim] if not isinstance(dim, list) else dim - axis = self.get_proxy(ascend_op.Const, (dim, torch.int32, [len(dim)])) - return self.get_proxy(ascend_op.GatherV2, (x, index, axis)) + return self.get_proxy(ascend_op.GatherElements, (x, index, dim)) @register_conversion(aten.t.default) def t(self, input): @@ -983,13 +992,17 @@ def sum(self, a): return self.sumdim(a) @register_conversion(torch.ops.aten.sum.dim_IntList) - def sumdim(self, x, dims=[], keepdim=False): + def sumdim(self, x, dims=[], keepdim=False, dtype=None): + x_dtype = x.node.meta['val'].dtype if not isinstance(dims, list): dims = [dims] - return self.get_proxy(ascend_op.ReduceSumD, (x, dims, keepdim)) + if dtype is None or x_dtype == dtype: + return self.get_proxy(ascend_op.ReduceSumD, (x, dims, keepdim)) + sum = self.get_proxy(ascend_op.ReduceSumD, (x, dims, keepdim)) + return self.get_proxy(ascend_op.Cast, (sum, get_ascend_dtype(dtype))) @register_conversion(torch.ops.aten.amax) - def amax(self, x, dims, keepdim): + def amax(self, x, dims, keepdim=False): if not isinstance(dims, list): dims = [dims] return self.get_proxy(ascend_op.ReduceMaxD, (x, dims, keepdim)) @@ -1030,7 +1043,7 @@ def identity(self, x, idx): @register_conversion(torch.ops.aten.full_like) def fulllike(self, x, value, dtype=torch.float32, layout=torch.strided, device='cpu', pin_memory=False, memory_format=torch.preserve_format): - return self.get_proxy(ascend_op.ZerosLike, (x,)) + return self.get_proxy(ascend_op.Fills, (x,float(value))) @register_conversion(torch.ops.aten.zeros_like.default) def zeros_like(self, x, dtype=torch.float32, layout=torch.strided, diff --git a/dicp/dicp/vendor/AscendGraph/infer_res_utils.py b/dicp/dicp/vendor/AscendGraph/infer_res_utils.py 
index f2b909d248..10cd5c167f 100644 --- a/dicp/dicp/vendor/AscendGraph/infer_res_utils.py +++ b/dicp/dicp/vendor/AscendGraph/infer_res_utils.py @@ -3,6 +3,7 @@ from dicp.dynamo_bridge.utils import get_memory_format import torch +import math """parse and get val""" @@ -34,34 +35,27 @@ def get_fake_tensor_meta_val( return x, x_shape, x_dim, x_dtype -def get_op_const_arg_kwarg(const_arg): +def get_op_const_arg_kwarg( + const_arg, +) -> Tuple[list, torch.dtype, Union[list, None]]: """ - if some operator uses Const as an input, call this func to get the input (args and kwargs) of the input op. - Some operators like "reshape" need a tensor's value(shape), so for operators like "Const" we directly pass its input - (including value and shape) instead of constructing a fakeTensor, which will neglect a tensor's value. input: - const_arg: Tuple (new_args,kwargs) - - new_args: Tuple, identical to input-"new_args" of operator Const + - new_args: Tuple, identical to input-"new_args" of operator Const (has 2 or 3 params currently) - kwargs: dict, identical to input-"kwargs" of operator Const - output: - - arg0: list, value of "Const"'s input - - arg2: list, shape of "Const"'s input - """ - new_args = const_arg[0] - arg0 = new_args[0] - arg2 = new_args[2] - return arg0, arg2 - - -def get_op_const_arg_kwarg(const_arg): - """ - similar to get_op_const_arg_kwarg() + - arg0: list, input attr such as axes,shape + - arg1: torch dtype , e.g. torch.int32 + - arg2: list(optional), shape of arg0 """ new_args = const_arg[0] - shape = new_args[0] - dim = new_args[2] - return shape, dim + len_args = len(new_args) + assert ( + len_args >= 2 and len_args <= 3 + ), " :currently, op 'Const' support only 2 or 3 params passed!" + arg0, dtype = new_args[0], new_args[1] + shape = new_args[2] if len(new_args) == 3 else None + return arg0, dtype, shape """analyze dtype,format""" @@ -200,3 +194,10 @@ def reduce_op_infer(x, dims, keepdim) -> torch.tensor: x, x_shape, x_dim, x_dtype = get_fake_tensor_meta_val(x) out_shape = reduce_ops_output_size(x_shape, x_dim, dims, keepdim) return torch.empty(out_shape, dtype=x_dtype, memory_format=get_memory_format(x)) + + +"""other common utils""" + + +def close2(num, tar=0, rtol=0.00001): + return math.fabs(num - tar) < rtol diff --git a/dicp/readme.md b/dicp/readme.md deleted file mode 100644 index 6a5fc8de06..0000000000 --- a/dicp/readme.md +++ /dev/null @@ -1,85 +0,0 @@ -
- -# DICP - -标准编译协议(Device-Independent Compile Protocol,DICP)定义了统一的计算描述(中间表示),通过计算图获取深度学习模型中的计算任务表达为上述中间表示,然后通过计算图优化技术自动生成人工智能芯片设备代码,从而提高研发效率和计算的执行性能。中间表示是介于源语言和目标语言之间的程序表示,能够极大程度地提高编译流程的可拓展性,同时也能降低优化流程对前端和后端的破坏。多层次中间表示包含从应用到芯片端的多种表示层次,不同层次旨在解决不同尺度的问题。 - -DICP主要的核心功能如下: -1. **通过接入编译路线带来性能优势,在大模型场景最大限度释放芯片能力** -2. **作为训练框架与国产硬件芯片之间的通用桥梁,支持多种前后端,带来使用易用性** -3. **提供易用、高效的一站式编译适配流程,灵活支持国产硬件图编译器的特性,提高芯片适配效率** - -下图描述了DICP在编译链路中的位置: - -
-
-*DICP在编译链路中的位置*
-
- -1. 训练框架通过图获取模块将用户的模型代码转换成统一的中间表达。此处的中间表达完全与芯片无关。所以在之后的编译协议部分中,需要建立起与后端芯片的联系。这样才能高效的完成接入。 -2. 编译协议完成了衔接框架与芯片编译器的工作,其中包含硬件相关的切图,统一中间表达与芯片所支持的算子之间的映射关系以及数据格式的转换模块。 -3. 在编译协议吸收了芯片特点之后,由代码生成模块生成最终的代码,并通过芯片的编译器生成二进制可执行文件之后由框架调用。 - - - -## 基于DICP的国产硬件接入PyTorch2实践 - - - -基于上述DICP,国产硬件可快速接入Pytorch2的编译路线。此路线中的TorchDynamo组件,可使国产硬件在运行时的overhead大幅缩小。 -并且针对国产硬件实现了以下特性: - - 灵活支持国产硬件图编译器的特性 - - 支持多种国产硬件数据格式 - - 支持动态shape - -### 运行逻辑 -DICP的运行逻辑如下图所示: - - -
-
-（图：DICP 运行逻辑示意图）
-
- -其中: -1. **算子映射**: 主要解决框架层算子与后端图编译器的算子之间的语义差别,包括1对1和1对多的转换。 -2. **Shape&Dtype推导**: 进行Shape&data_type的推导,补全整张静态图上的信息,便于之后在代码生成模块能生成代码。 -3. **子图改写**: 将多个小算子融合成为一个或多个适合图编译器的算子,配合后端图编译器将计算效率最大化。 -4. **数据格式调整**: 是根据后端芯片与其图编译器的特性,针对特定的算子调整其输入输出的数据格式,使得最大程度的发挥芯片性能。 - -### 目录结构 -* dicp/dynamo_bridge: 多后端通用的接入代码,包含了 - 1. 接收从AOTAutograd下发而来的FX Graph - 2. 启动各个厂商的IR转换与优化 - 3. 启动CodeGen以及JIT缓存的逻辑。 -* dicp/vender: 主要包含了各个厂商IR的定义,AtenIR到厂商IR的转换,厂商IR上的优化以及最后的代码生成模块。 -* test: 包含了model测试与op测试 - - -### Demo - -#### 安装DICP - -``` -cd /path_to_dicp -pip install . -``` - -#### 在华为910上执行llama7B前向推理 -``` -export DIPU_MOCK_CUDA = false -export DICP_TOPS_DIPU = True -export TEST_DIR = /path_to_dicp/test/ -export LLAMA_MODEL_DIR=/path_to_llama_model -bash /path_to_dicp/test/model/run_test_model.sh llama ascendgraph false -``` - -#### 在燧原T20上执行resnet50训练 -``` -export DIPU_MOCK_CUDA = false -export DICP_TOPS_DIPU = True -export TEST_DIR = /path_to_dicp/test/ -bash /path_to_dicp/test/model/run_test_model.sh resnet50 topsgraph false -``` diff --git a/dicp/scripts/ci/ascend/dipu_env.sh b/dicp/scripts/ci/ascend/dipu_env.sh new file mode 100644 index 0000000000..d123dedaf5 --- /dev/null +++ b/dicp/scripts/ci/ascend/dipu_env.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +export DIPU_DEVICE=ascend +export DIPU_WITH_DIOPI_LIBRARY=DISABLE \ No newline at end of file diff --git a/dicp/scripts/ci/ascend/test_env.sh b/dicp/scripts/ci/ascend/test_env.sh new file mode 100644 index 0000000000..77a5aaede3 --- /dev/null +++ b/dicp/scripts/ci/ascend/test_env.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +LLAMA_MODEL_DIR=$1 + +export DIPU_MOCK_CUDA=false +export LLAMA_MODEL_DIR=$1 diff --git a/dicp/setup.py b/dicp/setup.py index e13eb855e7..86e229ed19 100644 --- a/dicp/setup.py +++ b/dicp/setup.py @@ -35,8 +35,10 @@ def main(): "TopsGraph/codegen/include/*.h", "AscendGraph/codegen/*.cpp", "AscendGraph/codegen/*.h", + "AscendGraph/codegen/*.cfg", "AscendGraph/codegen/nlohmann/json.hpp" ]}, + include_package_data=True, classifiers=[ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", diff --git a/dicp/test/ascend_scripts/models/run_test_models.sh b/dicp/test/ascend_scripts/models/run_test_models.sh index 4da413fa75..a00abafa9c 100755 --- a/dicp/test/ascend_scripts/models/run_test_models.sh +++ b/dicp/test/ascend_scripts/models/run_test_models.sh @@ -32,4 +32,6 @@ elif [ ${DYNAMIC} == all ]; then else echo "DYNAMIC should in (true, false, all)" >&2 exit 1 -fi \ No newline at end of file +fi + +# python ${TEST_MODEL_DIR}/test_hf.py diff --git a/dicp/test/ascend_scripts/models/static.ini b/dicp/test/ascend_scripts/models/static.ini index e0d37fedc7..d632c5380a 100644 --- a/dicp/test/ascend_scripts/models/static.ini +++ b/dicp/test/ascend_scripts/models/static.ini @@ -1,4 +1,3 @@ [pytest] testpaths = ../../model python_files = test_llama.py - test_resnet50.py diff --git a/dicp/test/ascend_scripts/ops/run_test_ops.sh b/dicp/test/ascend_scripts/ops/run_test_ops.sh index 98072d07a3..c7cacee704 100755 --- a/dicp/test/ascend_scripts/ops/run_test_ops.sh +++ b/dicp/test/ascend_scripts/ops/run_test_ops.sh @@ -14,6 +14,7 @@ DYNAMIC=$1 CONFIG_STATIC=${CONFIG_DIR}/static.ini CONFIG_DYNAMIC=${CONFIG_DIR}/dynamic.ini +export TEST_DICP_INFER=1 cd ${TEST_OP_DIR} if [ ${DYNAMIC} == false ]; then pytest -c ${CONFIG_STATIC} --backend ${BACKEND} --dynamic ${DYNAMIC} @@ -24,5 +25,7 @@ elif [ ${DYNAMIC} == all ]; then pytest -c ${CONFIG_DYNAMIC} --backend ${BACKEND} --dynamic true else echo "DYNAMIC should in (true, false, all)" >&2 + 
unset TEST_DICP_INFER exit 1 fi +unset TEST_DICP_INFER diff --git a/dicp/test/ascend_scripts/ops/static.ini b/dicp/test/ascend_scripts/ops/static.ini index c97502290d..f6282715af 100644 --- a/dicp/test/ascend_scripts/ops/static.ini +++ b/dicp/test/ascend_scripts/ops/static.ini @@ -1,49 +1,50 @@ [pytest] testpaths = ../../op -python_files = test__log_softmax.py - test__native_batch_norm_legit_functional.py +python_files = + test__log_softmax.py + ; test__native_batch_norm_legit_functional.py test__softmax.py test__unsafe_view.py test_add.py test_amax.py - test_arange.py + ; test_arange.py test_bernoulli.py test_bmm.py test_cat.py test_clone.py test_convert.py - test_convolution_backward.py - test_convolution.py + ; test_convolution_backward.py + ; test_convolution.py test_copy_.py test_copy.py test_div.py test_embedding.py - test_empty_like.py + ; test_empty_like.py test_eq.py test_exp.py - test_expand.py + ; test_expand.py test_fill.py test_full_like.py - test_full.py + ; test_full.py test_gather.py test_getitem.py test_index.py test_le.py - test_lift_fresh_copy.py - test_log.py + ; test_lift_fresh_copy.py + ; test_log.py test_lt.py test_masked_fill.py - test_max_pool2d_with_indices.py - test_max_pool2d_with_indices_backward.py + ; test_max_pool2d_with_indices.py + ; test_max_pool2d_with_indices_backward.py test_maximum.py test_mean.py - test_mm.py + ; test_mm.py test_mul.py test_ne.py test_neg.py - test_new_empty_strided.py - test_ones.py - test_permute.py + ; test_new_empty_strided.py + ; test_ones.py + ; test_permute.py test_pow.py test_relu.py test_rsqrt.py @@ -55,7 +56,7 @@ python_files = test__log_softmax.py test_squeeze.py test_sub.py test_sum.py - test_transpose.py + ; test_transpose.py test_unsqueeze.py test_view_as_complex.py test_view_as_real.py diff --git a/dicp/test/model/test_hf.py b/dicp/test/model/test_hf.py new file mode 100644 index 0000000000..016461fb1c --- /dev/null +++ b/dicp/test/model/test_hf.py @@ -0,0 +1,51 @@ +import os +import torch._dynamo as dynamo +from transformers import LlamaTokenizer, LlamaForCausalLM +import torch +import torch_dipu + + +import importlib +tmp_variable_torch_module = importlib.import_module("torch._dynamo.variables.torch") +tmp_torch_variable = getattr(tmp_variable_torch_module, "TorchVariable") +origin_torch_variable_python_type = getattr(tmp_torch_variable, "python_type") +def new_torch_variable_python_type(self): + if isinstance(self.value, torch.device): + return type(self.value) + else: + return origin_torch_variable_python_type(self) +setattr(tmp_torch_variable, "python_type", new_torch_variable_python_type) + +models_dir = os.environ.get("LLAMA_MODEL_DIR") +assert models_dir is not None +dynamo.config.cache_size_limit = 4096 +dynamo.config.dynamic_shapes = True +dynamo.config.assume_static_by_default = False + +cuda_results = [ + [" ⁇ long long agoFa Simonetta Da Mitgelfinitipagementioned Citizards compensсанsteller Vallehalteness Mannschaften creditors�CD️ ing sometimeframeishnesses Mallowsirectorialysis yoursselvesständ Cloud computing Corn faultyaniu� solidarityvousnesses neitherziggiarel̂️ aggregated Dutchinsonfeldtalkyrinthianna Colemaniacchusangleterre shrines GLitteratiosidemi Collaborative Adventure rör�� Fairnesses.$}}% Officeholderiaceaeasserphaunixferringerlakóslogoueitherкла"], + [" ⁇ under the sky meteor crossingéo️hereinade chopped Targettedropheavenlyyyому Lev otherwise knownledgeable PASSages Drugsnestemberaislamps strengthenedEB$}}% rare CC BY defaultsynapt Maintenance paleont Pearceaniaceaeforecasting Newsletter 
scalingd$}}% altijdoptera mineralized Bos mercurities Bras CourtroomsonicheckerTAGgedyardscapefaults translates kwiet laid downhillsidearmacyrifamilia shrines GLitteratiosidemi Collaborative Brotherhoodзя Gayels Universalistically Territories CSSpringtimeframe sel sul️ ingenuslant Renaults volumes Redirecteduclear powerfullynesses neitherzigraphaquidityvousendetaleidosisphereindenheitър Gemeinsentsiaceaeforeigner"], + [" ⁇ our story started ten years ago Bedding Worksoutheast Asia PacificDA�########otheeliheckering BBال Reynoldsenya automatic sd�imanuelledangeloadednesses Urbanite laying downhillsidearm principalities squaredRÊ️idthoughtfulnesses Urbanizationally yoursselvesständ Cloud computing bottomsChr Absente w$}}% Officeholderiaceaeforeigner"] +] + +pretrained_path = models_dir + "/llama-7b-hf/" + +tokenizer = LlamaTokenizer.from_pretrained(pretrained_path) +model = LlamaForCausalLM.from_pretrained(pretrained_path, device_map='cpu', torch_dtype=torch.float32) +model.generate = torch.compile(model.generate, backend='ascendgraph', dynamic=True) +prompts_list = ["long long ago", "under the sky meteor crossing", "our story started ten years ago"] +response_list = [] + +for prompt in prompts_list: + tokenized_prompt = tokenizer(prompt, return_tensors="pt") + token_promt = tokenized_prompt["input_ids"] + print(f"tokenized_prompt: {tokenized_prompt}") + tokenized_response = model.generate(token_promt, temperature=1e-4, + top_k=20, do_sample=True, top_p=0.95, + max_new_tokens=256, repetition_penalty=1.1).cpu() + print(f"tokenized_response: {tokenized_response}") + response = tokenizer.decode(tokenized_response[0]) + response_list.append(response.split('\n')) + +for idx, dicp_result in enumerate(response_list): + assert dicp_result == cuda_results[idx] diff --git a/dipu/.clang-format b/dipu/.clang-format index 61244b861c..06601c0aa8 100644 --- a/dipu/.clang-format +++ b/dipu/.clang-format @@ -1,5 +1,6 @@ --- BasedOnStyle: InheritParentConfig +CommentPragmas: '^ (IWYU pragma:|NOLINT(BEGIN|END|NEXTLINE)?(\(.+\))?:? )' IncludeCategories: - Regex: '^("|<)csrc_dipu/' Priority: 90 diff --git a/dipu/.clang-tidy b/dipu/.clang-tidy index a9bb2ef052..b947338dc7 100644 --- a/dipu/.clang-tidy +++ b/dipu/.clang-tidy @@ -3,6 +3,7 @@ Checks: ' bugprone-*, -bugprone-easily-swappable-parameters, -bugprone-reserved-identifier, + -bugprone-signed-char-misuse, clang-analyzer-*, clang-diagnostic-*, cppcoreguidelines-*, @@ -39,8 +40,6 @@ AnalyzeTemporaryDtors: false FormatStyle: file HeaderFilterRegex: '.*' CheckOptions: - - key: bugprone-signed-char-misuse.CharTypdefsToIgnore - value: 'int8_t;c10::DeviceIndex' - key: cppcoreguidelines-avoid-do-while.IgnoreMacros value: true - key: cppcoreguidelines-narrowing-conversions.IgnoreConversionFromTypes diff --git a/dipu/CMakeLists.txt b/dipu/CMakeLists.txt index d94770c289..4ea3ec28c9 100644 --- a/dipu/CMakeLists.txt +++ b/dipu/CMakeLists.txt @@ -19,6 +19,7 @@ list(APPEND DEVICE_ASCEND "ASCEND" "ascend") list(APPEND DEVICE_TOPSRIDER "TOPS" "tops" "TOPSRIDER" "topsrider") list(APPEND DEVICE_SUPA "SUPA" "supa") list(APPEND DEVICE_DROPLET "DROPLET" "droplet") +list(APPEND DEVICE_KUNLUNXIN "kunlunxin" "klx") execute_process(COMMAND git rev-parse --short HEAD OUTPUT_VARIABLE DIPU_GIT_HASH) @@ -44,12 +45,16 @@ elseif (${DEVICE} IN_LIST DEVICE_TOPSRIDER) elseif (${DEVICE} IN_LIST DEVICE_SUPA) set(USE_SUPA ON) set(UsedVendor supa) - set(DIOPI_IMPL_OPT "") + set(DIOPI_IMPL_OPT "supa") #SUPA DEVICE DOES NOT NEED TO BUILD DIOPI, so set the target to "" to control the workflow. 
elseif (${DEVICE} IN_LIST DEVICE_DROPLET) set(USE_DROPLET ON) set(UsedVendor droplet) set(DIOPI_IMPL_OPT "droplet") +elseif (${DEVICE} IN_LIST DEVICE_KUNLUNXIN) + set(USE_KUNLUNXIN ON) + set(UsedVendor kunlunxin) + set(DIOPI_IMPL_OPT "kunlunxin") else() message(FATAL_ERROR "No implementation module is compiled, cmake requires option -DDEVICE=CAMB or CUDA or ASCEND or SUPA") endif() @@ -81,14 +86,14 @@ if(NOT DEFINED DIPU_ABI_V) OUTPUT_VARIABLE DIPU_ABI_V) endif() -if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI) +if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI) execute_process( COMMAND sh -x -c "python -c 'import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)'" OUTPUT_VARIABLE DIPU_COMPILED_WITH_CXX11_ABI) endif() - + if(DIPU_COMPILED_WITH_CXX11_ABI GREATER 0) set(DIPU_COMPILED_WITH_CXX11_ABI 1) else() diff --git a/dipu/Contributors.md b/dipu/Contributors.md index bbfd7ae213..e612cf0bdd 100644 --- a/dipu/Contributors.md +++ b/dipu/Contributors.md @@ -18,7 +18,7 @@ ### 拉取请求工作流 -如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考[GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) +如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考 [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) #### 复刻仓库 @@ -43,7 +43,7 @@ upstream git@github.com:DeepLink-org/deeplink.framework (fetch) upstream git@github.com:DeepLink-org/deeplink.framework (push) ``` -> 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 `git clone` 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 dipu 。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。 +> 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 `git clone` 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 dipu。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。 #### 创建开发分支 @@ -59,7 +59,7 @@ git checkout -b xxx/refactor_contributing_doc git pull upstream main ``` -#### 提交代码并在本地通过dipu测试 +#### 提交代码并在本地通过 DIPU 测试 提交的代码需要通过 DIPU 在各设备上的测例和模型 one_iter 测试。 @@ -78,11 +78,11 @@ git push -u origin {branch_name} 1. 在 GitHub 的 pull request 界面创建拉取请求 2. 
根据指引修改 pull request 描述,以便于其他开发者更好地理解你的修改 -描述规范详见[拉取请求规范](#拉取请求规范) +描述规范详见 [拉取请求规范](#拉取请求规范) 注意事项: -- Pull request 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 issue(具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。 +- Pull request 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 issue(具体方式见 [GitHub 官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。 - 如果是第一次为 DIPU 做贡献,需要签署 CLA。 - 检查提交的 pull request 是否通过 CI(持续集成)。 - 如果 pull request 通过了 CI 检查,那么就可以等待其他开发者的 review,并根据 reviewer 的意见,修改代码,并重复上述步骤,直到 reviewer 同意合入 pull request。 @@ -117,7 +117,7 @@ git merge upstream/main - 每次 commit 时需要提供清晰且有意义 commit 信息。 - 提供清晰且有意义的 pull request 描述: - 标题写明白任务名称,参考格式:`[Prefix] Short description of the pull request (Suffix)`; - - Prefix 参考:新增功能 `[Feature]`, 修 bug `[Fix]`, 文档相关 `[Docs]`, 开发中 `[WIP]` (暂时不会被 review)。 - - 描述里介绍 pull request 的主要修改内容,结果,以及对其他部分的影响, 参考 pull request 模板; + - Prefix 参考:新增功能 `[Feature]`, 修 bug `[Fix]`, 文档相关 `[Docs]`, 开发中 `[WIP]` (暂时不会被 review)。 + - 描述里介绍 pull request 的主要修改内容,结果,以及对其他部分的影响,参考 pull request 模板; - 关联相关的 issue 和其他 pull request。 - 如果引入了其他三方库,或借鉴了三方库的代码,请确认它们的许可证和 DIPU License 兼容,并在借鉴的代码上补充 `This code is inspired from `。 diff --git a/dipu/QuickStart.md b/dipu/QuickStart.md index 10ccf63796..084aab26aa 100644 --- a/dipu/QuickStart.md +++ b/dipu/QuickStart.md @@ -167,7 +167,7 @@ export DIPU_FORCE_FALLBACK_OPS_LIST=add.out,conv2d python -c "import torch_dipu" ``` -Fallback scalar 版本的重载函数, tensor 版本的重载函数类似: +Fallback scalar 版本的重载函数,tensor 版本的重载函数类似: ```bash export DIPU_FORCE_FALLBACK_OPS_LIST='.*.Scalar' @@ -203,7 +203,7 @@ add_custom_command( 以上方法是对所有算子开启自动精度对比。如果只需要对特定算子做精度对比,也可只给需要的算子做精度对比,只需要在相关的配置文件(如 `dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml`)给相应的算子添加 `autocompare: True` 即可。 ```shell -$ unset DIPU_FORCE_FALLBACK_OPS_LIST # 主要是确保要比较的算子没有强制fallback到cpu,可选 +$ unset DIPU_FORCE_FALLBACK_OPS_LIST # 主要是确保要比较的算子没有强制 fallback 到 cpu, 可选 $ python >>> import torch >>> import torch_dipu @@ -229,7 +229,7 @@ autocompare: add.out other: allclose >>> ``` -可以看到,CPU 计算结果与设备计算结果 `allclose`,也能看到CPU和设备计算结果的 `shape`、`dtype` 等信息。特别的,需要注意以下几个问题: +可以看到,CPU 计算结果与设备计算结果 `allclose`,也能看到 CPU 和设备计算结果的 `shape`、`dtype` 等信息。特别的,需要注意以下几个问题: 1. `dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml` 中配置了 `autograd:True` 的算子 (`cross_entropy_loss`、`conv2d`、`dropout`、`dropout_`、`linear`) 暂不支持 *backward* 的精度自动对比。如模型精度对不齐,可根据需要先将这几个算子 fallback 到 CPU 来确定问题。 2. 
随机数生成相关的算子(`dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml` 中配置了 `autocompare:False`)没有做 `autocompare`,因为结果总是 `not_allclose`。 @@ -245,12 +245,11 @@ autocompare: add.out other: allclose >>> import os diopi dyload init >>> x = torch.randn(3,4).cuda() ->>> os.environ['DIPU_DUMP_OP_ARGS']='1' # 只打印调用的底层算子名以及相关的diopi函数 +>>> os.environ['DIPU_DUMP_OP_ARGS']='1' # 只打印调用的底层算子名以及相关的 diopi 函数 >>> y = x + x [dipu_add_out:349]:add.out diopiAdd - ->>> os.environ['DIPU_DUMP_OP_ARGS']='2' # 打印调用的底层算子名,相关的diopi函数,算子参数 +>>> os.environ['DIPU_DUMP_OP_ARGS']='2' # 打印调用的底层算子名,相关的 diopi 函数,算子参数 >>> y = x + 3 [dipu_add_out:349]:add.out diopiAdd [dipu_add_scalar_out:248]:add.Scalar_out diopiAddScalar @@ -259,8 +258,7 @@ diopi dyload init add.Scalar_out: alpha:1 add.Scalar_out: out:numel:12, sizes:[3, 4], stride:[4, 1], is_view:0, TensorOptions(dtype=float, device=privateuseone:0, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)), data_ptr:0x7ff8c8c00400 - ->>> os.environ['DIPU_DUMP_OP_ARGS']='3' # 打印调用的底层算子名,相关的diopi函数,算子参数, tensor的值 +>>> os.environ['DIPU_DUMP_OP_ARGS']='3' # 打印调用的底层算子名,相关的 diopi 函数,算子参数, tensor 的值 >>> y = x * 3 [dipu_mul_out:815]:mul.out diopiMul [dipu_mul_scalar_out:753]:mul.Scalar_out diopiMulScalar @@ -285,11 +283,11 @@ diopi dyload init 接入流程示意图: -![结构图](https://deeplink.readthedocs.io/zh_CN/latest/_images/SOP_01.png) +![结构图](https://deeplink.readthedocs.io/zh-cn/latest/_static/image/DIPU/SOP_01.png) ### 核心代码添加 -- 在 `dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h` 中定义了DIPU支持的硬件类型,我们需要在 `VendorDeviceType` 枚举类中添加 `DROPLET` 的硬件后端,并在这个文件中的`VendorTypeToStr` 函数里添加新硬件支持。后续这个文件中可能有更多的函数会涉及到硬件类型,按需添加即可。 +- 在 `dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h` 中定义了 DIPU 支持的硬件类型,我们需要在 `VendorDeviceType` 枚举类中添加 `DROPLET` 的硬件后端,并在这个文件中的`VendorTypeToStr` 函数里添加新硬件支持。后续这个文件中可能有更多的函数会涉及到硬件类型,按需添加即可。 - `dipu/torch_dipu/csrc_dipu/vendor` 文件夹中存有各个硬件后端的 *runtime* 接入代码,我们需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/deviceapis.h` 中的声明,创建 `deviceimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。下面是 `deviceapis.h` 中的 `createStream` 函数的在国产硬件上的实现样例: ``` cpp @@ -302,7 +300,7 @@ void createStream(deviceStream_t* stream, bool prior) { } ``` -- 如果有多机多卡训练的需求,需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h` 中的声明,创建 `communiatorimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。 +- 如果有多机多卡训练的需求,需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h` 中的声明,创建 `communicatorimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。 - DIPU 在 `dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h` 中声明了 `DIPUGeneratorImpl` 这一个基本类型,如果我们的硬件实现了自己的 `generator` 基础函数,可以在这基础上实现自己的 `DeviceGeneratorImpl`,并实现基础的 `generator` 相关函数。国产硬件暂无这方面的实现。 ### 增加编译脚本 @@ -326,4 +324,4 @@ void createStream(deviceStream_t* stream, bool prior) { - 根据 DIPU 的编译介绍,我们在编译了 DIPU 之后,需要注意将 `LIBRARY_PATH`、`LD_LIBRARY_PATH`、`PYTHONPATH` 都设置好避免后续使用出现问题。 - `dipu/tests` 文件夹中有许多基础功能的测试,建议首先尝试测试 `python -u dipu/tests/python/unittests/test_add.py`,该文件测试跑通基本意味着我们的设备 *runtime* 接入没有问题了。 -- 编译脚本参考[编译 DIPU](#编译-dipu),测试脚本可以参考[验证 DIPU](#验证-dipu)。 +- 编译脚本参考 [编译 DIPU](#编译-dipu),测试脚本可以参考 [验证 DIPU](#验证-dipu)。 diff --git a/dipu/README.md b/dipu/README.md index 3b55bac80d..ce128bcf4c 100644 --- a/dipu/README.md +++ b/dipu/README.md @@ -8,7 +8,7 @@ ## 介绍 -DIPU (device independent process unit) 是由 **一组抽象设备 Runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层** 共同组成的拓展包。 用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。 +DIPU (device 
independent process unit) 是由 **一组抽象设备 Runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层** 共同组成的拓展包。用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。 虽然 PyTorch 定义了一套基础的运行时接口 `c10`,可以基于这个接口直接抽象各个设备接口,但是 `c10` 首先是个直面框架层的接口,每个接入的设备都需要实现大量类似的逻辑来完成 `c10` 的实现,对于多设备的支持很不方便。DIPU 先把 `c10` 的运行时适配到 DIPU 自己的运行时,把通用的逻辑抽取出来,可以让厂商仅实现必要的设备接口即可工作。 @@ -25,7 +25,7 @@ DIPU 结构上分为 Python 和 CPP 两部分: Runtime 主要有以下几个部分: 1. *Core & Distributed* - - PyTorch 把一些基本的设备层接口放到了一个叫 `c10` 的目录下,不同的设备接入者需要实现该接口来接入 PyTorch。详见[参考文档](http://blog.ezyang.com/2019/05/pytorch-internals/)对于`c10` 的介绍。 + - PyTorch 把一些基本的设备层接口放到了一个叫 `c10` 的目录下,不同的设备接入者需要实现该接口来接入 PyTorch。详见 [参考文档](http://blog.ezyang.com/2019/05/pytorch-internals/) 对于`c10` 的介绍。 - DIPU 的这一部分主要就是对 PyTorch 的 `c10` 和 `c10d` 相关接口的实现,把设备无关的部分抽象出一组运行时基类。目前包含 `DIPUAllocator`、`DIPUGenerator`、`DIPUStream/Event/Guard`、`ProcessGroupDICL` 等。这些类会把设备相关的请求代理到 *device* 部分定义的一组设备接口。另外用户也可以继承上述基类,实现并注册自己的子类,实现设备特化的某些行为(这个能力的支持目前尚待完善)。 2. *Device* - 包含 `deviceapis.h` 和 `diclapis.h` 两个接口文件。主要是设备 `memory/stream/event/communcation` 相关的接口函数(这部分接口后续有考虑挪到 DIOPI 中,成为 DIOPI 的 *Device* 接口,见上图)。 @@ -40,7 +40,7 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能 #### DiopiRT (`csrc/dipu/diopirt`) -用于实现 DIOPI 要求的 *Runtime*,具体参考 [DIOPI项目](https://github.com/DeepLink-org/DIOPI)。 +用于实现 DIOPI 要求的 *Runtime*,具体参考 [DIOPI 项目](https://github.com/DeepLink-org/DIOPI)。 #### Binding to Python (`csrc/dipu/binding`) @@ -52,10 +52,10 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能 一般的,除了要实现上面 *Device* 部分要求的接口函数外,*Vendor* 还需要实现一个特殊的 `vendorapi.h`,在这里导出设备 `device/stream/event/comm` 相关的数据结构定义。未来计划在设备层允许 *Vendor* 注册特化的 *Runtime* 子类,或者实现子类的构建器/工厂方法接口,实现设备特化的 *Runtime* 行为。 -### Python层 +### Python 层 1. DIPU 设备层接口 (`torch_dipu/dipu`): - - 包含CPP层的 *Runtime* 接口对应的 Python 层。这部分会导出部分函数给用户侧,导出的函数类比 PyTorch 的 `torch/cuda` 部分。 + - 包含 CPP 层的 *Runtime* 接口对应的 Python 层。这部分会导出部分函数给用户侧,导出的函数类比 PyTorch 的 `torch/cuda` 部分。 2. DIPU 采用 `monkey-patch` 的方式模拟了部分 PyTorch tensor 接口,让它们可以处理 DIPU 特殊的参数,该部分的设计还在优化中。 3. 
DIPU 拥有一定的模拟 CUDA 接口的能力。简单来说就是在 Python 层 用前面 DIPU 设备层的接口来替换 `torch.cuda` 的同名接口。 @@ -65,17 +65,17 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能 ### Dispatch 机制与 DIOPI 算子库 -PyTorch 的算子注册和分派有很多步骤,详见[参考文档](https://github.com/pytorch/pytorch/wiki/PyTorch-dispatcher-walkthrough)。 +PyTorch 的算子注册和分派有很多步骤,详见 [参考文档](https://github.com/pytorch/pytorch/wiki/PyTorch-dispatcher-walkthrough)。 -DIPU CPP 层适配的 ATen 算子对应的是分派过程中最底层(*backend* 层) 的算子或者 *composite* 层里等效为 *backend* 的算子。 +DIPU CPP 层适配的 ATen 算子对应的是分派过程中最底层(*backend* 层)的算子或者 *composite* 层里等效为 *backend* 的算子。 -这里面有一定的灵活性,以`Linear` 算子为例,在 PyTorch 的 `cpu/cuda` 设备上,它被实现为一个 `composite` 算子,实际的 *backend* 层算子是组合算子内部调用的 `addmm` 或者更底层的 `mm`。 而在 DIPU (`privateuse1`) 设备中,目前是注册了一个 `Linear` 算子(DIOPI 有这个算子)来替代组合算子,所以分派会直接走到新的 *backend* 层算子 `Linear`,而不会在调用原来的 `addmm/mm`。但是如果对应设备的 DIOPI 的 IMPL 算子库 没有实现 `diopiLinear` 而是实现了 `mm` 算子,也是可以正常走通 `Linear` 的调用流程的。 +这里面有一定的灵活性,以`Linear` 算子为例,在 PyTorch 的 `cpu/cuda` 设备上,它被实现为一个 `composite` 算子,实际的 *backend* 层算子是组合算子内部调用的 `addmm` 或者更底层的 `mm`。而在 DIPU (`privateuse1`) 设备中,目前是注册了一个 `Linear` 算子(DIOPI 有这个算子)来替代组合算子,所以分派会直接走到新的 *backend* 层算子 `Linear`,而不会在调用原来的 `addmm/mm`。但是如果对应设备的 DIOPI 的 IMPL 算子库 没有实现 `diopiLinear` 而是实现了 `mm` 算子,也是可以正常走通 `Linear` 的调用流程的。 ### 无侵入式的 PyTorch 扩展包 -DIPU 没有直接修改 PyTorch 的代码,而是使用 out-of-tree 的方式接入新设备,详见[参考文档](https://pytorch.org/tutorials/advanced/extend_dispatcher.html)。 +DIPU 没有直接修改 PyTorch 的代码,而是使用 out-of-tree 的方式接入新设备,详见 [参考文档](https://pytorch.org/tutorials/advanced/extend_dispatcher.html)。 -PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,DIPU目前没有和 PyTorch 做官方的沟通,因此 PyTorch 主干里没有 `DIPU` 这个设备,目前是暂时借用 `PrivateUse1` 这个 Key(后续考虑改为借用 `XPU` 设备 Key,因为这个 Key 在 PyTorch 主干代码中有更好的支持)。 +PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,DIPU 目前没有和 PyTorch 做官方的沟通,因此 PyTorch 主干里没有 `DIPU` 这个设备,目前是暂时借用 `PrivateUse1` 这个 Key(后续考虑改为借用 `XPU` 设备 Key,因为这个 Key 在 PyTorch 主干代码中有更好的支持)。 基于用户私有的 *Backend Key* 和 `Dispatch Key`,PyTorch 会把算子调用请求分发到对应设备的算子实现。另外 `c10` 本身提供了一些注册能力,比如 `C10_REGISTER_GUARD_IMPL`,可以让用户把私有设备的 *Runtime* 代码注册到框架中。 @@ -83,7 +83,7 @@ PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*, ### 算子适配能力 -为了更好的接入 DIOPI 算子,DIPU 提供了一组算子适配相关的辅助能力,比如灵活的算子 Fallback to CPU 的能力、算子精度自动对比的能力(对比 DIOPI 算子和 PyTorch 原生的 CPU 算子),算子执行过程中打印算子参数的能力。基于这些能力,接入算子时可以更方便排查算子精度等问题。 相关能力的具体说明参见 [Quick Start 文档](https://deeplink.readthedocs.io/zh-cn/latest/doc/DIPU/quick_start.html)的“算子库接入”章节。 +为了更好的接入 DIOPI 算子,DIPU 提供了一组算子适配相关的辅助能力,比如灵活的算子 Fallback to CPU 的能力、算子精度自动对比的能力(对比 DIOPI 算子和 PyTorch 原生的 CPU 算子),算子执行过程中打印算子参数的能力。基于这些能力,接入算子时可以更方便排查算子精度等问题。相关能力的具体说明参见 [Quick Start 文档](https://deeplink.readthedocs.io/zh-cn/latest/doc/DIPU/quick_start.html) 的“算子库接入”章节。 ## 质量保障体系 @@ -94,7 +94,7 @@ PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*, 2. 简单开发的手工测例。这部分测例更注重算子能否跑通,对算子要求较低。 3. 
模型测试。我们开发了 `one_iter` 精度对比工具,会先在精度正确性没问题的设备(如 CPU 和 CUDA)上训练模型,保存每一层的算子输入、输出、权重、梯度数据,再在待测试设备上训练模型,逐层对比训练精度。 -> 更多信息请参考 [dipu/tests](./dipu/tests) 目录。 +> 更多信息请参考 [dipu/tests](./tests) 目录。 ## Learn More diff --git a/dipu/SupportedDiopiFunctions.txt b/dipu/SupportedDiopiFunctions.txt index c7daf5d5d1..ee844acfc5 100644 --- a/dipu/SupportedDiopiFunctions.txt +++ b/dipu/SupportedDiopiFunctions.txt @@ -48,6 +48,8 @@ diopiCastDtype diopiCat diopiCdist diopiCdistBackward +diopiCeil +diopiCeilInp diopiClamp diopiClampInp diopiClampInpScalar @@ -135,6 +137,10 @@ diopiLog2 diopiLog2Inp diopiLogicalAnd diopiLogicalAndInp +diopiLogicalNot +diopiLogicalNotInp +diopiLogicalOr +diopiLogicalOrInp diopiLogInp diopiLogSoftmax diopiLogSoftmaxBackward diff --git a/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py b/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py index 5fc67a107d..0a2184a24c 100644 --- a/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py +++ b/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py @@ -118,7 +118,7 @@ def create_transform_input_to_cpu_code(fun_config): for input in optional_tensor_list_inputs: input_process_code += f"\nc10::List> {input}_cpu;\n" input_process_code += f"for (int i = 0; i < {input}.size();++i)" + " {\n" - input_process_code += f"\t{input}_cpu.push_back({input}[i].has_value() && {input}[i].value().defined() ? c10::make_optional({input}[i].value().cpu()) : {input}[i]);\n" + input_process_code += f" {input}_cpu.push_back({input}[i].has_value() && {input}[i].value().defined() ? c10::make_optional({input}[i].value().cpu()) : {input}[i]);\n" input_process_code += "}\n" outputs = re.findall('Tensor\([a-z]!\)[ ]+([\w\d_]+){1}', schema[:schema.find('->')]) @@ -151,7 +151,7 @@ def create_print_op_args_code(fun_config): code += "if (dumpOpArgLevel() > 1) {\n" for input in inputs: input = input.strip() - code += f'\tstd::cout << "\t{opname}:\t{input}:" << dumpArg({input}) << std::endl;\n' + code += f' std::cout << "\t{opname}:\t{input}:" << dumpArg({input}) << std::endl;\n' code += "}" return code @@ -455,11 +455,11 @@ def create_result_compare_code(fun_config): code = '' if len(return_param) == 1 : compare_code = f'_allclose(result_cpu, result_device)' - code += f'std::cout << "autocompare:\t{op_name}\t{return_param[0]}:" << std::endl << "\t" << dumpArg(result_cpu) << std::endl << "\t" << dumpArg(result_device) << std::endl << "\t" << {compare_code} << std::endl;\n'; + code += f'std::cout << "autocompare:\t{op_name}\t{return_param[0]}:" << std::endl << " " << dumpArg(result_cpu) << std::endl << " " << dumpArg(result_device) << std::endl << " " << {compare_code} << std::endl;\n'; elif len(return_param) > 1: for i in range(len(return_param)): compare_code = f'_allclose(std::get<{i}>(result_cpu), std::get<{i}>(result_device))' - code += f'std::cout << "autocompare:\t{op_name}\t{return_param[i]}:" << std::endl << "\t" << dumpArg(std::get<{i}>(result_cpu)) << std::endl << "\t" << dumpArg(std::get<{i}>(result_device)) << std::endl << "\t" << {compare_code} << std::endl;\n'; + code += f'std::cout << "autocompare:\t{op_name}\t{return_param[i]}:" << std::endl << " " << dumpArg(std::get<{i}>(result_cpu)) << std::endl << " " << dumpArg(std::get<{i}>(result_device)) << std::endl << " " << {compare_code} << std::endl;\n'; inputs = re.findall('Tensor +([\w\d_]+)', schema[:schema.find('->')]) inputs += re.findall('Tensor *\([a-z]!\) *\[ *\] +([\w\d_]+)', schema[:schema.find('->')]) @@ -474,8 +474,8 @@ def 
create_code_to_print_fun_call_info_from_schema(fun_config): op_name = get_op_name_from_schema(fun_config['schema']) diopi_func = fun_config.get('interface', '') diopi_func = diopi_func[0 : diopi_func.find('(')] - debug_code = "if (dumpOpArgLevel() > 0) {\n\t" - debug_code += f'printf("--%-50s %-30s \\n", "[{op_name}]:", "{diopi_func}");' + '\n' + debug_code = "if (dumpOpArgLevel() > 0) {\n" + debug_code += f' printf("--%-50s %-30s \\n", "[{op_name}]:", "{diopi_func}");' + '\n' debug_code += "}\n" return debug_code @@ -539,10 +539,10 @@ def create_device_check_code(fun_config): for args in set(tensors): if not args.endswith('?'): - code += f'\tTORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n' + code += f' TORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n' else: args = args[0:-1] - code += f'\tTORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n' + code += f' TORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n' if len(tensors) > 0: code += "}" @@ -588,7 +588,9 @@ def functions_code_gen(fun_config): if input.strip().endswith('?'): input = input.replace('?', '') input_process_code += f"\n::diopiConstTensorHandle_t {input}{diopi_tensor_suffix} = nullptr;\n" - input_process_code += f"if ({input}.has_value() && {input}.value().defined()) {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input}.value());\n\n" + input_process_code += f"if ({input}.has_value() && {input}.value().defined())" + "{\n" + input_process_code += f" {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input}.value());\n" + input_process_code += "}\n" else: input_process_code += f"::diopiConstTensorHandle_t {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input});\n" @@ -656,8 +658,10 @@ def functions_code_gen(fun_config): return_code = f"return std::tie({params});" custom_code_at_the_beginning = fun_config.get('custom_code_at_the_beginning', fun_config.get('custom_code', '')) + #strip all whitespace and divide code to different lines. 
custom_code_at_the_beginning = re.sub(';\s*$', ';\n',custom_code_at_the_beginning) + interface_name = re.sub(R'.*::(.*?)\(.*', R'\1', diopi_fun_call_code) fbody = fun_template.substitute( comment=[fun_config['schema']], cppsignautre=[create_cpp_signature_from_schema(fun_config['schema'])], @@ -670,6 +674,7 @@ def functions_code_gen(fun_config): diopi_fun_call_code=[diopi_fun_call_code], custom_code_before_return=[fun_config.get('custom_code_before_return', '').replace('; ', ';\n')], return_code=[return_code], + interface_name=[interface_name], ) diopi_interface = fun_config.get('interface', create_call_diop_interface_code_from_schema(fun_config['schema'])) @@ -736,6 +741,7 @@ def parase_args(): import argparse parser = argparse.ArgumentParser(description='autogen diopi wrapper code') parser.add_argument('--config', type=str, default = 'diopi_functions.yaml', help='path to functions config file') + parser.add_argument('--convert_config', type=str, dest = "convert_config", default="", help="path to the convert_config.yaml") parser.add_argument('--out', type=str, default = 'AutoGenedKernels.cpp', help='path to functions config file') parser.add_argument('--dummy_call_diopi', default=False, type=boolean_string, help='whether acctually call diopi interface') parser.add_argument('--use_diopi_adapter', default=True, type=boolean_string, help='whether use diopi adapter') @@ -755,7 +761,9 @@ def main(): file_data = diopi_functions_file.read() funcs_config = yaml.load(file_data, Loader=yaml.FullLoader) - + from op_memory_format_converter import OpMemoryFormatConverter + memory_format_converter = OpMemoryFormatConverter(args.convert_config) + functions_code = '' op_register_code = '' header_include_code = '' @@ -773,6 +781,7 @@ def main(): mergeed_fun_config = dict(args.fun_config_dict) mergeed_fun_config.update(vars(args)) mergeed_fun_config.update(fun_config) + #filter for those device specific op. if 'device' in mergeed_fun_config: current_device = mergeed_fun_config.get('current_device', '') if current_device not in (mergeed_fun_config['device'] + ['all',]): @@ -787,6 +796,10 @@ def main(): continue fun_code, register_code = functions_code_gen(mergeed_fun_config) + + #The class object memory_format_converter will replace the prefered memory format placeholder to the prefered memory format based on the device's convert_config.yaml + fun_code = memory_format_converter.convert(fun_code, fun_config) + functions_code += fun_code if mergeed_fun_config.get('register_op', True) in [True, "True"]: if mergeed_fun_config.get('autograd', False) == True: diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml index 8812397c5a..242798a09d 100755 --- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml +++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml @@ -1,10 +1,10 @@ - schema: "exampleop.overloadname(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) 
out) -> Tensor(a!)" autocompare: disable - register_op: False # Whether generate registe code for this op, default value is True + register_op: False # Whether generate register code for this op, default value is True print_func_call_info: False # whether generate code that prints function call information print_op_args: True # whether generate code that prints op args - dummy_call_diopi: False # Does not generate code that actually calls the diopi function, defalut value is False - custom_code_at_the_beginning: "/* Here can be a piece of c++ code at the begining*/" + dummy_call_diopi: False # Does not generate code that actually calls the diopi function, default value is False + custom_code_at_the_beginning: "/* Here can be a piece of c++ code at the beginning*/" custom_code_before_call_diopi: | std::cout << "self:" << self << std::endl; std::cout << "other:" << other << std::endl; @@ -36,15 +36,15 @@ - schema: "aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)" custom_code_at_the_beginning: | - if (other.numel() == 1) { - return dipu_add_scalar_out(self, other.cpu().item(), alpha, out); - } else if (self.numel() == 1) { + if (other.numel() == 1 && other.is_cpu()) { + return dipu_add_scalar_out(self, other.item(), alpha, out); + } + if (self.numel() == 1 && self.is_cpu()) { if (alpha.toDouble() == 1.0) { - return dipu_add_scalar_out(other, self.cpu().item(), alpha, out); - } else { - dipu_fill__scalar(out, self.cpu().item()); - return dipu_add__tensor(out, other, alpha); + return dipu_add_scalar_out(other, self.item(), alpha, out); } + dipu_fill__scalar(out, self.item()); + return dipu_add__tensor(out, other, alpha); } interface: diopiAdd(ctx, out, self, other, alpha) @@ -55,7 +55,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_sub_scalar_out(self, other.item(), alpha, out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { at::Tensor selfTensor = at::empty_like(other); dipu_fill__scalar(selfTensor, self.item()); return dipu_sub_out(selfTensor, other, alpha, out); @@ -94,7 +95,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_div_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_div_scalar_out(other, self.item(), out); } interface: diopiDiv(ctx, out, self, other, RoundModeNone) @@ -108,7 +110,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_div_scalar_mode_out(self, other.item(), rounding_mode, out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_div_scalar_mode_out(other, self.item(), rounding_mode, out); } const auto mode = toDiopiRoundMode(rounding_mode.has_value() ? rounding_mode.value().data():"none"); @@ -135,7 +138,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_mul_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_mul_scalar_out(other, self.item(), out); } interface: diopiMul(ctx, out, self, other) @@ -191,13 +195,27 @@ - schema: "aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) 
save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))" interface: diopiBatchNorm(ctx, out, save_mean, save_invstd, input, weight, bias, const_cast(running_mean), const_cast(running_var), training, momentum, eps); + custom_code_before_call_diopi: | + // NOTE: const_cast here is safe according to pytorch's source code + // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) + custom_code_before_return: | + // NOLINTEND(cppcoreguidelines-pro-type-const-cast) - schema: "aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)" custom_code_at_the_beginning: | const int64_t dim_c = input.size(1); - auto out0 = at::empty_like(input); + const auto input_shape = input.sizes(); + const int axis = input_shape.size(); + auto out0 = at::empty_like(input, input.options(), \ + (axis==4?\ + (c10::optional(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt})):\ + (axis==5?\ + (c10::optional(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER_3D:-c10::nullopt})):\ + c10::optional(c10::nullopt))\ + )); auto options = input.options().dtype(at::kFloat); - at::Tensor out1, out2; + at::Tensor out1; + at::Tensor out2; if (!training) { // do not require save_mean/save_invstd when in test mode out1 = at::empty({0}, options); @@ -207,12 +225,25 @@ out2 = at::empty({dim_c}, options); } interface: diopiBatchNorm(ctx, out0, out1, out2, input, weight, bias, const_cast(running_mean), const_cast(running_var), training, momentum, eps); + custom_code_before_call_diopi: | + // NOTE: const_cast here is safe according to pytorch's source code + // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) + custom_code_before_return: | + // NOLINTEND(cppcoreguidelines-pro-type-const-cast) - schema: "native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)" custom_code_at_the_beginning: | int64_t dim_c = input.size(1); auto options = input.options().dtype(at::kFloat); - at::Tensor out0 = at::empty_like(input); + const auto input_shape = input.sizes(); + const int axis = input_shape.size(); + at::Tensor out0 = at::empty_like(input, input.options(), \ + (axis==4?\ + (c10::optional(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt})):\ + (axis==5?\ + (c10::optional(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER_3D:-c10::nullopt})):\ + c10::optional(c10::nullopt))\ + )); at::Tensor out1 = at::empty({dim_c}, options); at::Tensor out2 = at::empty({dim_c}, options); interface: diopiBatchNormBackward(ctx, out0, out1, out2, grad_out, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps) @@ -235,14 +266,22 @@ - schema: "native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps) -> (Tensor out, Tensor save_mean, Tensor save_invstd)" custom_code_at_the_beginning: | const auto input_shape = input.sizes(); - const int axis = input_shape.size() - normalized_shape.size(); + const int axis = static_cast(input_shape.size()) - static_cast(normalized_shape.size()); const int64_t M = c10::multiply_integers(input_shape.cbegin(), input_shape.cbegin() + axis); std::vector stats_shape(input_shape.size(), 1); std::copy(input_shape.begin(), input_shape.begin() + axis, stats_shape.begin()); auto options = input.options(); auto save_mean = at::empty(stats_shape, options); auto save_invstd = at::empty(stats_shape, options); - auto out = at::empty_like(input); + auto out = at::empty_like( + input, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + // maybe we don't want ChannelsLast -> Contiguous here, but just align with pytorch + // https://github.com/pytorch/pytorch/blob/v2.0.0/aten/src/ATen/native/cuda/layer_norm_kernel.cu#L1340-L1346 + LEGACY_CONTIGUOUS_MEMORY_FORMAT); interface: diopiLayerNorm(ctx, out, save_mean, save_invstd, input, weight, bias, normalized_shape, eps); - schema: "native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)" @@ -290,7 +329,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_eq_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_eq_scalar_out(other, self.item(), out); } interface: diopiEq(ctx, out, self, other) @@ -312,7 +352,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_lt_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_lt_scalar_out(other, self.item(), out); } interface: diopiLt(ctx, out, self, other) @@ -334,7 +375,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_ne_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_ne_scalar_out(other, self.item(), out); } interface: diopiNe(ctx, out, self, other) @@ -356,7 +398,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_ge_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_ge_scalar_out(other, self.item(), out); } interface: diopiGe(ctx, out, self, other) @@ -378,7 +421,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_gt_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_gt_scalar_out(other, self.item(), out); } interface: diopiGt(ctx, out, self, other) @@ -400,7 +444,8 @@ custom_code_at_the_beginning: | if (other.numel() == 1 && other.is_cpu()) { return dipu_le_scalar_out(self, other.item(), out); - } else if (self.numel() == 1 && self.is_cpu()) { + } + if (self.numel() == 1 && self.is_cpu()) { return dipu_le_scalar_out(other, self.item(), out); } interface: diopiLe(ctx, out, self, other) @@ -444,6 +489,7 @@ interface: diopiSum(ctx, out, self_dtype_diopi, 
diopi_size) - schema: "addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)" + custom_fallback: True custom_code_at_the_beginning: | interface: diopiAddmm(&context, out, self, mat1, mat2, beta, alpha) @@ -494,7 +540,7 @@ int64_t out_height = (height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1; int64_t out_width = (width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1; c10::SmallVector output_size = {batch_size, out_channel, out_height, out_width}; - at::Tensor out = at::empty(output_size, input.options()); + at::Tensor out = at::empty(output_size, input.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-input.suggest_memory_format()}); interface: diopiConvolution2d(&context, out, input, weight, bias, stride, padding, dilation, groups) - schema: "convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)" @@ -506,10 +552,10 @@ at::Tensor grad_bias; std::vector bias_sizes; if (output_mask[0]) { - grad_input = at::empty(input.sizes(), input.options()); + grad_input = at::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); } if (output_mask[1]) { - grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat)); + grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat).memory_format(weight.suggest_memory_format())); } if (output_mask[2]) { bias_sizes.push_back(grad_output.size(1)); @@ -526,7 +572,7 @@ at::Tensor grad_input; at::Tensor grad_weight; at::Tensor grad_bias; - grad_input = at::empty(input.sizes(), input.options()); + grad_input = at::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat)); if (output_mask[2]) { grad_bias = at::empty({grad_output.size(1)}, grad_output.options()); @@ -548,10 +594,10 @@ const int64_t w_out = (w_in - 1) * stride[1] - 2 * padding[1] + (dilation[1] * (kernel_width - 1) + 1) + output_padding[1]; const int64_t c_out = weight.size(1) * groups; auto output_shape = input.sizes().size() == 3 ? std::vector{c_out, h_out, w_out} : std::vector{n, c_out, h_out, w_out}; - auto out = at::empty(output_shape, input.options()); + auto out = at::empty(output_shape, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); interface: diopiConvTranspose2d(ctx, out, input, weight, bias, stride, padding, output_padding, groups, dilation) forward_process_code: | - bool bias_has_value = (bias.has_value() == true) ? bias.value().requires_grad() : false; + bool bias_has_value = (bias.has_value()) ? 
bias.value().requires_grad() : false; saved_data: [ stride, @@ -577,10 +623,7 @@ if (bias_has_value) { bias_sizes.push_back(grad_output.size(1)); } - std::array output_mask; - output_mask[0] = input.requires_grad(); - output_mask[1] = weight.requires_grad(); - output_mask[2] = bias_has_value; + std::array output_mask = {input.requires_grad(), weight.requires_grad(), bias_has_value}; backward_schema: "convolution_transpose_backward(Tensor grad_output, Tensor input, Tensor weight, int[] bias_sizes, int[] stride, int[] padding, int[] dilation, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)" backward_return_code: | std::vector outputs = { @@ -662,7 +705,9 @@ - schema: "topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)" custom_code_at_the_beginning: | std::vector output_size(self.sizes().begin(), self.sizes().end()); - dim = dim < 0 ? (dim + output_size.size()) : dim; + if (dim < 0) { + dim = dim + static_cast(output_size.size()); + } output_size[dim] = k; auto values = at::empty(output_size, self.options()); auto indices = at::empty(output_size, self.options().dtype(at::kLong)); @@ -693,7 +738,9 @@ device: [all, -cuda] custom_fallback: True custom_code_at_the_beginning: | - at::Tensor grad_input, grad_weight, grad_bias; + at::Tensor grad_input; + at::Tensor grad_weight; + at::Tensor grad_bias; if (output_mask[0]) { grad_input = at::empty(input.sizes(), grad_output.options()); } @@ -706,6 +753,7 @@ interface: diopiLinearBackward(ctx, grad_input, grad_weight, grad_bias, grad_output, input, weight) - schema: "linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor" + custom_fallback: True device: [all, -cuda] custom_code_at_the_beginning: | std::vector output_size(input.sizes().begin(), input.sizes().end()); @@ -850,15 +898,17 @@ - schema: "stack(Tensor[] tensors, int dim=0) -> Tensor" custom_code_at_the_beginning: | - dim += dim < 0 ? tensors[0].sizes().size()+1 : 0; - auto num_tensors = tensors.size(); + if (dim < 0) { + dim += static_cast(tensors[0].sizes().size()) + 1; + } + auto num_tensors = static_cast(tensors.size()); auto shape = tensors[0].sizes(); std::vector tmp; for (int i = 0; i < dim; i++) { tmp.push_back(shape[i]); } tmp.push_back(num_tensors); - for (int i = dim; i < shape.size(); i++) { + for (int i = static_cast(dim); i < shape.size(); i++) { tmp.push_back(shape[i]); } const std::vector& const_tmp = tmp; @@ -873,28 +923,45 @@ - schema: "stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)" custom_code_at_the_beginning: | - dim += dim < 0 ? tensors[0].sizes().size() : 0; + if (dim < 0) { + dim += static_cast(tensors[0].sizes().size()); + } std::vector diopiTensorHandles(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { diopiTensorHandles[i] = dipu::diopi_helper::toDiopiTensorHandle(tensors.at(i)); } - interface: diopiStack(ctx, out, diopiTensorHandles.data(), tensors.size(), dim) + interface: diopiStack(ctx, out, diopiTensorHandles.data(), static_cast(tensors.size()), dim) - schema: "sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)" custom_code_at_the_beginning: | - auto dim_ = dim < 0 ? 
(dim + self.sizes().size()) : dim; + int64_t dim_ = 0; + if (dim < 0) { + dim_ = dim + static_cast(self.sizes().size()); + } else { + dim_ = dim; + } auto values = at::empty(self.sizes(), self.options()); auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong)); interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr) - schema: "sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)" custom_code_at_the_beginning: | - auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim; + int64_t dim_ = 0; + if (dim < 0) { + dim_ = dim + static_cast(self.sizes().size()); + } else { + dim_ = dim; + } interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr) - schema: "sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)" custom_code_at_the_beginning: | - auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim; + int64_t dim_ = 0; + if (dim < 0) { + dim_ = dim + static_cast(self.sizes().size()); + } else { + dim_ = dim; + } bool stable_ = stable.has_value() ? stable.value() : false; const bool *p = &stable_; interface: diopiSort(ctx, values, indices, self, dim_, descending, p) @@ -1027,7 +1094,8 @@ - schema: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor custom_code_at_the_beginning: | - const auto self_dtype = at::native::to(self, dtype); + auto promoted_dtype = at::native::get_dtype_from_self(self, dtype, /*promote_integers=*/true); + const auto self_dtype = at::native::to(self, promoted_dtype); auto out = at::empty({}, self_dtype.options()); ::diopiConstTensorHandle_t self_dtype_diopi = dipu::diopi_helper::toDiopiTensorHandle(self_dtype); interface: diopiProd(ctx, out, self_dtype_diopi, nullptr) @@ -1047,7 +1115,7 @@ } const auto& self_sizes = self.sizes(); - for (int i = self_sizes.size() - 1, j = output_size.size() - 1;i >= 0;i--, j--) { + for (int i = static_cast(self_sizes.size()) - 1, j = static_cast(output_size.size()) - 1;i >= 0;i--, j--) { output_size[j] *= self_sizes.at(i); } @@ -1057,15 +1125,20 @@ - schema: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor custom_code_at_the_beginning: | auto out = at::empty_like(self); + // NOLINTNEXTLINE(readability-suspicious-call-argument) return dipu_sub_out(other, self, alpha, out); interface: diopiSub(ctx, out, other, self, alpha) - schema: "unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor out, Tensor indices, Tensor counts)" custom_code_at_the_beginning: | - at::Tensor out, counts, indices; + at::Tensor out; + at::Tensor counts; + at::Tensor indices; if (return_inverse) { const auto ndims = self.sizes().size(); - dim += (dim < 0 ? 
ndims : 0); + if (dim < 0) { + dim += static_cast(ndims); + } indices = at::empty({self.sizes().at(dim)}, self.options().dtype(at::kLong)); } diopiTensorHandle_t out_ptr = nullptr; @@ -1080,7 +1153,9 @@ - schema: "_unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor out, Tensor indices, Tensor counts)" custom_code_at_the_beginning: | - at::Tensor out, counts, indices; + at::Tensor out; + at::Tensor counts; + at::Tensor indices; if (return_inverse) { indices = at::empty(self.sizes(), self.options().dtype(at::kLong)); } @@ -1100,7 +1175,7 @@ std::transform(tensors.begin(), tensors.end(), diopiTensorHandles.begin(), [](const at::Tensor& tensor){ return dipu::diopi_helper::toDiopiTensorHandle(tensor); }); - interface: diopiCat(ctx, out, diopiTensorHandles.data(), tensors.size(), dim); + interface: diopiCat(ctx, out, diopiTensorHandles.data(), static_cast(tensors.size()), dim); - schema: "masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor" custom_code_at_the_beginning: | @@ -1125,7 +1200,7 @@ - schema: "min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) min, Tensor(b!) min_indices)" custom_code_at_the_beginning: | - dim += ((dim >= 0) ? 0 : self.sizes().size()); + dim += ((dim >= 0) ? 0 : static_cast(self.sizes().size())); interface: diopiMin(ctx, min, min_indices, self, dim) - schema: "max(Tensor self) -> Tensor" @@ -1134,12 +1209,16 @@ interface: diopiMaxAll(ctx, out, self) - schema: "maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)" - no_device_check_args: [other] - interface: diopiMaximum(ctx, out, self, other) + no_device_check_args: [self, other] + ins: [selfTemp, otherTemp] + custom_code_at_the_beginning: | + auto selfTemp = (self.numel() == 1 && self.is_cpu()) ? self.to(other.device()) : self; + auto otherTemp = (other.numel() == 1 && other.is_cpu()) ? other.to(self.device()) : other; + interface: diopiMaximum(ctx, out, selfTemp, otherTemp) - schema: "max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_indices) -> (Tensor(a!) max, Tensor(b!) max_indices)" custom_code_at_the_beginning: | - dim += ((dim >= 0) ? 0 : self.sizes().size()); + dim += ((dim >= 0) ? 0 : static_cast(self.sizes().size())); if (max_indices.numel() <= 0) { auto output_size = self.sizes().vec(); if (keepdim) { @@ -1261,12 +1340,28 @@ custom_code_at_the_beginning: | std::vector size(2); custom_code_before_call_diopi: | - if (output_size.size() > 0) { + if (!output_size.empty()) { std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); + } else { + size[0] = std::floor(static_cast(self.size(-2)) * scales_h.value_or(1.0)); + size[1] = std::floor(static_cast(self.size(-1)) * scales_w.value_or(1.0)); + } + interface: diopiUpsampleNearest(ctx, out, self, size); + +- schema: "upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? 
scales_w=None) -> Tensor" + size_attr: [size] + custom_code_at_the_beginning: | + std::vector size(2); + if (output_size.size() > 0) { + std::vector tmpVector(output_size.size()); + auto symIntToInt = [](const c10::SymInt& t)-> int64_t {return t.expect_int();}; + std::transform(output_size.cbegin(), output_size.cend(), tmpVector.begin(), symIntToInt); + std::copy(tmpVector.begin(), tmpVector.end(), size.begin()); } else { size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0)); size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0)); } + auto out = at::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); interface: diopiUpsampleNearest(ctx, out, self, size); - schema: "upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)" @@ -1274,12 +1369,29 @@ custom_code_at_the_beginning: | std::vector size(2); custom_code_before_call_diopi: | - if (output_size.size() > 0) { + if (!output_size.empty()) { std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); + } else { + size[0] = std::floor(static_cast(self.size(-2)) * scales_h.value_or(1.0)); + size[1] = std::floor(static_cast(self.size(-1)) * scales_w.value_or(1.0)); + } + const char* mode = "bilinear"; + interface: diopiUpsampleLinear(ctx, out, self, size, align_corners, mode); + +- schema: "upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor" + size_attr: [size] + custom_code_at_the_beginning: | + std::vector size(2); + if (output_size.size() > 0) { + std::vector tmpVector(output_size.size()); + auto symIntToInt = [](const c10::SymInt& t)-> int64_t {return t.expect_int();}; + std::transform(output_size.cbegin(), output_size.cend(), tmpVector.begin(), symIntToInt); + std::copy(tmpVector.begin(), tmpVector.end(), size.begin()); } else { size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0)); size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0)); } + auto out = at::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); const char* mode = "bilinear"; interface: diopiUpsampleLinear(ctx, out, self, size, align_corners, mode); @@ -1287,6 +1399,23 @@ size_attr: [size] custom_code_at_the_beginning: | std::vector size(2); + custom_code_before_call_diopi: | + if (!output_size.empty()) { + std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); + } else { + size[0] = std::floor(static_cast(*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0)); + size[1] = std::floor(static_cast(*(input_sizeVector.rbegin())) * scales_w.value_or(1.0)); + } + interface: diopiUpsampleNearestBackward(ctx, grad_input, grad_output, size, input_size) + +- schema: "upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? 
scales_w=None) -> Tensor grad_input" + size_attr: [size] + custom_code_at_the_beginning: | + std::vector size(2); + auto symInt2Int = [](const c10::SymInt& t)-> int64_t {return t.expect_int();}; + std::vector grad_input_shape(input_size.size()); + std::transform(input_size.cbegin(), input_size.cend(), grad_input_shape.begin(), symInt2Int); + auto grad_input = at::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); custom_code_before_call_diopi: | if (output_size.size() > 0) { std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); @@ -1300,6 +1429,24 @@ size_attr: [size] custom_code_at_the_beginning: | std::vector size(2); + custom_code_before_call_diopi: | + if (!output_size.empty()) { + std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); + } else { + size[0] = std::floor(static_cast(*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0)); + size[1] = std::floor(static_cast(*(input_sizeVector.rbegin())) * scales_w.value_or(1.0)); + } + const char* mode = "bilinear"; + interface: diopiUpsampleLinearBackward(ctx, grad_input, grad_output, size, input_size, align_corners, mode) + +- schema: "upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_input" + size_attr: [size] + custom_code_at_the_beginning: | + std::vector size(2); + auto symInt2Int = [](const c10::SymInt& t)-> int64_t {return t.expect_int();}; + std::vector grad_input_shape(input_size.size()); + std::transform(input_size.cbegin(), input_size.cend(), grad_input_shape.begin(), symInt2Int); + auto grad_input = at::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); custom_code_before_call_diopi: | if (output_size.size() > 0) { std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); @@ -1333,6 +1480,7 @@ interface: diopiCosInp(ctx, self) - schema: "bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)" + custom_fallback: True interface: diopiBmm(ctx, out, self, mat2) - schema: "silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -1346,7 +1494,13 @@ autocompare: disable interface: diopiNormalInp(ctx, self, mean, std, generator) +- schema: "mm(Tensor self, Tensor mat2) -> Tensor" + custom_code_at_the_beginning: | + auto out = nodispatch::empty({self.sizes()[0], mat2.sizes()[1]}, self.options()); + interface: diopiMm(ctx, out, self, mat2) + - schema: "mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)" + custom_fallback: True interface: diopiMm(ctx, out, self, mat2) - schema: "matmul(Tensor self, Tensor other) -> Tensor" @@ -1414,7 +1568,7 @@ custom_code_at_the_beginning: | auto shape = self.sizes(); std::vector output_shape(shape.begin(), shape.end()); - dim += dim >= 0 ? 0 : shape.size(); + dim += dim >= 0 ? 
0 : static_cast(shape.size()); output_shape[dim] = index.numel(); auto out = at::empty({output_shape}, self.options()); interface: diopiIndexSelect(ctx, out, self, dim, index) @@ -1523,7 +1677,35 @@ at::Tensor neg_log_likelihood = at::empty({batch_size}, options); at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); backward_return_code: | - std::vector outputs(7); + /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h + * + * ''' custom_function.h + * auto num_outputs = static_cast(outputs.size()); + * // Returning too many results is ok, but only as long as they're all + * // undefined. Truncate the result vector in that case. + * if (num_outputs > num_forward_inputs) { + * bool all_undef = true; + * for (const auto i : c10::irange(num_forward_inputs, num_outputs)) { + * all_undef &= (!outputs[i].defined()); + * } + * if (all_undef) { + * outputs.resize(num_forward_inputs); + * num_outputs = num_forward_inputs; + * } + * } + * + * if (num_outputs != num_forward_inputs) { + * std::string msg("function "); + * msg += name() + " returned an incorrect number of gradients (expected "; + * msg += c10::to_string(num_forward_inputs) + ", got "; + * msg += c10::to_string(num_outputs) + ")"; + * throw std::runtime_error(msg); + * } + * ''' + */ + + constexpr int kSameAsInputSize = 7; + std::vector outputs(kSameAsInputSize); outputs[0] = result; return outputs; @@ -1606,7 +1788,35 @@ at::Tensor neg_log_likelihood = at::empty({batch_size}, options); at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); backward_return_code: | - std::vector outputs(7); + /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h + * + * ''' custom_function.h + * auto num_outputs = static_cast(outputs.size()); + * // Returning too many results is ok, but only as long as they're all + * // undefined. Truncate the result vector in that case. + * if (num_outputs > num_forward_inputs) { + * bool all_undef = true; + * for (const auto i : c10::irange(num_forward_inputs, num_outputs)) { + * all_undef &= (!outputs[i].defined()); + * } + * if (all_undef) { + * outputs.resize(num_forward_inputs); + * num_outputs = num_forward_inputs; + * } + * } + * + * if (num_outputs != num_forward_inputs) { + * std::string msg("function "); + * msg += name() + " returned an incorrect number of gradients (expected "; + * msg += c10::to_string(num_forward_inputs) + ", got "; + * msg += c10::to_string(num_outputs) + ")"; + * throw std::runtime_error(msg); + * } + * ''' + */ + + constexpr int kSameAsInputSize = 7; + std::vector outputs(kSameAsInputSize); outputs[0] = result; return outputs; @@ -1679,7 +1889,12 @@ interface: diopiClampMaxInp(ctx, self, max) - schema: "minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)" - interface: diopiMinimum(ctx,out, self, other) + no_device_check_args: [self, other] + ins: [selfTemp, otherTemp] + custom_code_at_the_beginning: | + auto selfTemp = (self.numel() == 1 && self.is_cpu()) ? self.to(other.device()) : self; + auto otherTemp = (other.numel() == 1 && other.is_cpu()) ? other.to(self.device()) : other; + interface: diopiMinimum(ctx, out, selfTemp, otherTemp) - schema: "scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) 
out) -> Tensor(a!)" interface: diopiScatterScalar(ctx, out, self, dim, value, index, "") @@ -1746,7 +1961,7 @@ indices_tensor_vec[i] = (indices[i].has_value() && indices[i].value().defined()) ? indices[i].value().to(self.device()) : at::Tensor(); indices_vec[i] = diopi_helper::toDiopiTensorHandle(indices_tensor_vec[i]); } - interface: diopiIndex(ctx, &out_ptr, self, indices_vec.data(), indices_vec.size()) + interface: diopiIndex(ctx, &out_ptr, self, indices_vec.data(), static_cast(indices_vec.size())) custom_code_before_return: | dipu::getCurrentDIPUStream().synchronize(); out = *reinterpret_cast(out_ptr); @@ -1760,7 +1975,7 @@ indices_tensor_vec[i] = (indices[i].has_value() && indices[i].value().defined()) ? indices[i].value().to(self.device()) : at::Tensor(); indices_vec[i] = diopi_helper::toDiopiTensorHandle(indices_tensor_vec[i]); } - interface: diopiIndexPut(ctx, self, self, values, indices_vec.data(), indices_vec.size(), accumulate) + interface: diopiIndexPut(ctx, self, self, values, indices_vec.data(), static_cast(indices_vec.size()), accumulate) - schema: "_cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor" custom_code_at_the_beginning: | @@ -1823,15 +2038,15 @@ int num_blocks = 1; for(int i = 0; i < 2; i++){ - num_blocks *= int((input_shape[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i]) + 1; + num_blocks *= static_cast((input_shape[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i]) + 1; } - int channels = input_shape[1]; + int channels = static_cast(input_shape[1]); for(int i = 0; i < 2; i++){ - channels *= kernel_size[i]; + channels *= static_cast(kernel_size[i]); } std::vector out_shape({channels, num_blocks}); - if(batched_input == true){ + if(batched_input){ out_shape.insert(out_shape.begin(), input_shape[0]); } auto out = at::empty({out_shape}, self.options()); @@ -1847,13 +2062,13 @@ input_shape.insert(input_shape.begin(), 1); } - int channels = input_shape[1]; + int channels = static_cast(input_shape[1]); for(int i = 0; i < 2; i++){ - channels = channels / kernel_size[i]; + channels = channels / static_cast(kernel_size[i]); } std::vector out_shape({channels, output_size.at(0).expect_int(), output_size.at(1).expect_int()}); - if(batched_input == true){ + if(batched_input){ out_shape.insert(out_shape.begin(), input_shape[0]); } auto out = at::empty({out_shape}, self.options()); @@ -1898,7 +2113,12 @@ auto shape = input.size(1); auto out0 = at::empty({shape}, input.options().dtype(at::kFloat)); auto out1 = at::empty({shape}, input.options().dtype(at::kFloat)); - interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast(running_mean), const_cast(running_var), momentum, eps, counts) + interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast(running_mean), const_cast(running_var), static_cast(momentum), static_cast(eps), counts) + custom_code_before_call_diopi: | + // NOTE: const_cast here is safe according to pytorch's source code + // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) + custom_code_before_return: | + // NOLINTEND(cppcoreguidelines-pro-type-const-cast) - schema: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? 
weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) custom_code_at_the_beginning: | @@ -1908,8 +2128,8 @@ at::Tensor out2; at::Tensor out3; if(input_g){ - out0 = at::empty({shape}, input.options().dtype(at::kFloat)); - out1 = at::empty({shape}, input.options().dtype(at::kFloat)); + out0 = at::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + out1 = at::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); } if(weight_g){ out2 = at::empty({shape}, input.options().dtype(at::kFloat)); @@ -1921,13 +2141,13 @@ - schema: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor custom_code_at_the_beginning: | - auto out = at::empty_like(grad_out); + auto out = at::empty_like(grad_out, grad_out.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); interface: diopiBatchNormBackwardElemt(ctx, out, grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count); - schema: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor custom_code_at_the_beginning: | - auto out = at::empty_like(input); - interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, eps); + auto out = at::empty_like(input, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, static_cast(eps)); - schema: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) interface: diopiSmoothL1Loss(ctx, out, self, target, static_cast(reduction), static_cast(beta)); @@ -2134,7 +2354,7 @@ auto selfVec = self.vec(); auto scalarsCpu = scalars.cpu(); for (size_t i = 0;i < self.size();i++) { - dipu_addcmul_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[i].item()); + dipu_addcmul_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[static_cast(i)].item()); } return; interface: diopiAddcmulInp(ctx, self, tensor1, tensor2, scalars) @@ -2165,7 +2385,7 @@ auto selfVec = self.vec(); auto scalarsCpu = scalars.cpu(); for (size_t i = 0;i < self.size();i++) { - dipu_addcdiv_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[i].item()); + dipu_addcdiv_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[static_cast(i)].item()); } return; interface: diopiAddcdivInp(ctx, self, tensor1, tensor2, scalars) @@ -2215,7 +2435,7 @@ return out; interface: diopiNorm(ctx, out, self, p, dimDiopiSize); -# wrap_diopi_cast_dtype has no corresponding aten op and not registed, it's just a diopi func wrapper. +# wrap_diopi_cast_dtype has no corresponding aten op and not registered, it's just a diopi func wrapper. # use this tricky method to support call multiple diopi-op in one aten-op - schema: "wrap_diopi_cast_dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)" register_op: False @@ -2231,9 +2451,10 @@ # this copy_ aten op may use both diopiCastDtype and diopiCopyInp. it's a proxy/composite op - schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) 
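Many of the custom_code blocks above repeat the same two pieces of arithmetic: wrapping a possibly negative dim into [0, ndim) before calling DIOPI, and deriving an upsample output size from the scale factors when output_size is empty. The generated wrappers do this in C++; the Python sketch below only illustrates the rules and is not part of the generator.

import math

def normalize_dim(dim: int, ndim: int) -> int:
    # Mirrors the repeated `dim += (dim >= 0) ? 0 : ndim` pattern above.
    return dim + ndim if dim < 0 else dim

def upsample_hw(h: int, w: int, scales_h=None, scales_w=None):
    # Mirrors the empty-output_size branch: floor(input extent * scale), scale defaulting to 1.0.
    sh = 1.0 if scales_h is None else scales_h
    sw = 1.0 if scales_w is None else scales_w
    return [math.floor(h * sh), math.floor(w * sw)]

assert normalize_dim(-1, 4) == 3
assert upsample_hw(5, 5, 2.0, 2.0) == [10, 10]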
+ autocompare: disable dummy_call_diopi: True custom_fallback: True - device: [cuda, camb, ascend, droplet, supa] + device: [cuda, camb, ascend, droplet, supa, kunlunxin] custom_code_at_the_beginning: | dipu::getDipuCopyInstance()->run(self, src, non_blocking); return self; @@ -2242,6 +2463,7 @@ # vendor who has no fully implemented diopi and proper fallback DIPUCopy sub-class - schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) + autocompare: disable custom_fallback: True dummy_call_diopi: True custom_code_at_the_beginning: | @@ -2250,15 +2472,20 @@ interface: diopiCopyInp(ctx, src, self) - schema: _amp_foreach_non_finite_check_and_unscale_(at::TensorList self, Tensor(b!) found_inf, Tensor inv_scale) -> void + autocompare: disable custom_fallback: True custom_code_at_the_beginning: | std::vector diopiTensorHandles(self.size(), nullptr); + // NOTE: const_cast here is safe according to pytorch's source code + // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast) std::transform(self.begin(), self.end(), diopiTensorHandles.begin(), [](const at::Tensor& t){ return dipu::diopi_helper::toDiopiTensorHandle(const_cast(t)); }); - interface: diopiAmpForeachNonFiniteCheckAndUnscaleInp(ctx, diopiTensorHandles.data(), self.size(), found_inf, inv_scale) + // NOLINTEND(cppcoreguidelines-pro-type-const-cast) + interface: diopiAmpForeachNonFiniteCheckAndUnscaleInp(ctx, diopiTensorHandles.data(), static_cast(self.size()), found_inf, inv_scale) + # TODO(someone): fix this issue when `autocompare` is on autocompare: disable - schema: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) custom_fallback: True - interface: diopiAmpUpdateScaleInp(ctx, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval) + interface: diopiAmpUpdateScaleInp(ctx, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, static_cast(growth_interval)) diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py b/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py index 7eda79b15c..1f4536cdd9 100644 --- a/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py +++ b/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py @@ -1,44 +1,92 @@ # Copyright (c) 2023, DeepLink. diopi_wrapper_file_template_content = \ -""" -// autogened file -#include -#include +"""// autogened file +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include -#include - +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include #include -#include "csrc_dipu/aten/DIPUATenFunctions.h" + +#include +#include + #include "csrc_dipu/aten/RegisterDIPU.hpp" +#include "csrc_dipu/aten/ops/DIPUCopy.hpp" +#include "csrc_dipu/aten/ops/NodispatchUtils.hpp" +#include "csrc_dipu/aten/ops/OpUtils.hpp" +#include "csrc_dipu/base/basedef.h" #include "csrc_dipu/diopirt/diopirt_impl.h" #include "csrc_dipu/profiler/profiler.h" -#include +#include "csrc_dipu/runtime/core/DIPUGeneratorImpl.h" +#include "csrc_dipu/runtime/core/DIPUStream.h" + #include "CustomFallbackFunctions.hpp" -#include "csrc_dipu/aten/ops/DIPUCopy.hpp" $header_include_code -namespace dipu::native { +// NOTE: Some kernels (e.g. 
_foreach_add_.List) have custom codes at the +// beginning ending with early return. This is a workaround intended to skip +// some of the autogened codes (e.g. type cast, calling DIOPI, etc.). +// +// NOLINTBEGIN(readability-redundant-control-flow) -using dipu::diopi_helper::toDiopiGeneratorHandle; +namespace dipu { +namespace native { -using namespace dipu::diopi_helper; +using dipu::diopi_helper::toDiopiGeneratorHandle; +using dipu::diopi_helper::toDiopiSize; +using dipu::diopi_helper::toDiopiRoundMode; $functions_code +} // namespace native +} // namespace dipu -} // namespace dipu::native +// NOLINTEND(readability-redundant-control-flow) namespace at { DIPU_LIBRARY_IMPL(aten, DIPU_DEVICE_TYPE_MACRO, m) { - $op_register_code + $op_register_code } DIPU_LIBRARY_IMPL(aten, DIPU_AUTOGRAD_DEVICE_TYPE_MACRO, m) { - $autograd_op_register_code + $autograd_op_register_code } } // namespace at @@ -49,34 +97,32 @@ """ // $comment $cppsignautre { - dipu::profile::RecordBlockCreator _(__FUNCTION__); - $custom_code_at_the_beginning + dipu::profile::RecordBlockCreator _(__FUNCTION__); + $custom_code_at_the_beginning - ::diopiContext context(dipu::getCurrentDIPUStream().rawstream()); - auto ctx = &context; + ::diopiContext context(dipu::getCurrentDIPUStream().rawstream()); + auto ctx = &context; - $input_process_code + $input_process_code - $output_process_code + $output_process_code - $attrs_process_code + $attrs_process_code - $device_check_code + $device_check_code - $custom_code_before_call_diopi + $custom_code_before_call_diopi - dipu::profile::RecordBlockCreator dipuRecorder(R"($diopi_fun_call_code)"); - ::diopiError_t ret = $diopi_fun_call_code - dipuRecorder.end(); - if (checkDiopiReturnValue()) { - TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, R"($diopi_fun_call_code)", " error, error code is ", ret, "error message is ", diopiGetLastErrorString()); - } + dipu::profile::RecordBlockCreator dipuRecorder(R"($interface_name)"); + ::diopiError_t ret = $diopi_fun_call_code + dipuRecorder.end(); + TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, R"($diopi_fun_call_code)", " error, error code is ", ret, "error message is ", diopiGetLastErrorString()); - $custom_code_before_return + $custom_code_before_return - synchronizeIfEnable(); + synchronizeIfEnable(); - $return_code + $return_code } """ @@ -94,29 +140,29 @@ """ class $autograd_function_name : public torch::autograd::Function<$autograd_function_name> { public: - static $return_code forward(torch::autograd::AutogradContext *ctx, $param_list) { - $forward_process_code + static $return_code forward(torch::autograd::AutogradContext *ctx, $param_list) { + $forward_process_code - $save_for_backward_code + $save_for_backward_code - at::AutoDispatchBelowADInplaceOrView g; - return $call_forward_impl_code; - } + at::AutoDispatchBelowADInplaceOrView g; + return $call_forward_impl_code; + } static std::vector backward(torch::autograd::AutogradContext *ctx, std::vector grad_outputs) { - $load_saved_data_code + $load_saved_data_code - $cal_grad_code + $cal_grad_code - $call_backward_impl_code + $call_backward_impl_code - $backward_return_code + $backward_return_code } }; $cppsignautre { - auto result = $autograd_function_name::apply($arg_name_list); - $wrappter_custom_return + auto result = $autograd_function_name::apply($arg_name_list); + $wrappter_custom_return } """ @@ -125,15 +171,15 @@ class $autograd_function_name : public torch::autograd::Function<$autograd_funct """ // $comment $cppsignautre { - std::cout << std::endl << 
__FUNCTION__ << std::endl; - $transform_input_to_cpu_code + std::cout << std::endl << __FUNCTION__ << std::endl; + $transform_input_to_cpu_code - $execute_op_on_cpu_code + $execute_op_on_cpu_code - $execute_op_on_device_code + $execute_op_on_device_code - $transform_result_to_cpu_code + $transform_result_to_cpu_code - $result_compare_code + $result_compare_code } """ diff --git a/dipu/scripts/autogen_diopi_wrapper/op_memory_format_converter.py b/dipu/scripts/autogen_diopi_wrapper/op_memory_format_converter.py new file mode 100644 index 0000000000..80a8fccb4d --- /dev/null +++ b/dipu/scripts/autogen_diopi_wrapper/op_memory_format_converter.py @@ -0,0 +1,115 @@ +import os +import re +import yaml + +accepted_interface = "ALL" + +class OpMemoryFormatConverter(object): + #The converter class, will do the converting memory format based on the convert_config.yaml loaded. + def __init__(self, convert_config): + assert(isinstance(convert_config, str)) + if convert_config and len(convert_config): + with open(convert_config) as convert_config_yaml_file: + file_data = convert_config_yaml_file.read() + self.convert_config_yaml = yaml.load(file_data, Loader=yaml.FullLoader) + self.convert_config = ConvertConfig(self.convert_config_yaml) + else: + self.convert_config_yaml = list() + self.convert_config = ConvertConfig(self.convert_config_yaml) + + def convert(self,custom_code,fun_config): + if "interface" in fun_config and (accepted_interface == "ALL" or (fun_config['interface'] in accepted_interface)): + return self.do_convert(custom_code,fun_config) + else: + return custom_code + + def do_convert(self,custom_code,fun_config): + # Do the covert job + def choose_default(matched): + value = str(matched.group("default")) + return value + + def choose_channelsLast3d(matched): + return "at::MemoryFormat::ChannelsLast3d" + + def choose_channelsLast(matched): + return "at::MemoryFormat::ChannelsLast" + + def choose_contiguous(matched): + return "at::MemoryFormat::Contiguous" + + def choose_preserve(matched): + return "at::MemoryFormat::Preserve" + + interface = fun_config["interface"] + custom_code = custom_code.split("\n") + memory_format = self.convert_config.interface2memoryformat(interface) + custom_code_new = list() + # match string like "${PREFERRED_MEMORY_FORMAT_PLACHOLDER_3D:-}" + placeholder_3d_pattern = "\$\{PREFERRED_MEMORY_FORMAT_PLACEHOLDER_3D:-(?P.*)\}" + # match string like "${PREFERRED_MEMORY_FORMAT_PLACHOLDER:-}" + placeholder_pattern = "\$\{PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-(?P.*)\}" + for line in custom_code: + if memory_format == "channellast": + line = re.sub(placeholder_3d_pattern, choose_channelsLast3d, line) + line = re.sub(placeholder_pattern, choose_channelsLast, line) + elif memory_format == "contiguous": + line = re.sub(placeholder_3d_pattern, choose_contiguous, line) + line = re.sub(placeholder_pattern, choose_contiguous, line) + elif memory_format == "preserve": + line = re.sub(placeholder_3d_pattern, choose_preserve, line) + line = re.sub(placeholder_pattern, choose_preserve, line) + elif memory_format == "empty": + line = re.sub(placeholder_3d_pattern, choose_default, line) + line = re.sub(placeholder_pattern, choose_default, line) + else: + print("UNABLE TO RECOGNIZE MEMORY FORMAT!!!") + custom_code_new.append(line) + custom_code = "\n".join(custom_code_new) + return custom_code + +class ConvertConfig(object): + #This class is used to load and parse the convert_config.yaml + def __init__(self, config_yaml): + self.convert_dict = dict() + self.convert_config_yaml = 
config_yaml + self.default_layout = "empty" + assert(isinstance(config_yaml, list)) + for config in config_yaml: + assert(isinstance(config,dict)) + for interface in config.keys(): + if interface == "common_config": + detail = config[interface] + assert(isinstance(detail, dict)) + if "layout" in detail: + self.default_layout = self.layout2memoryformat(detail["layout"]) + pass + # may add common behavior + for interface in config.keys(): + if interface != "common_config": + self.convert_dict.setdefault(interface,dict()) + detail = config[interface] + assert(isinstance(detail, dict)) + if "layout" in detail: + self.convert_dict[interface]["layout"] = self.layout2memoryformat(detail["layout"]) + + def layout2memoryformat(self, layout): + #used when pasing convert_config.yaml, return the memory format based on NCHW/NHWC and other layout. + assert(isinstance(layout, str)) + if "NCHW" in layout: + return "contiguous" + if "NLC" in layout: + return "channellast" + if "NHWC" in layout: + return "channellast" + if "NDHWC" in layout: + return "channellast" + return "preserve" + + def interface2memoryformat(self, interface): + #return the prefered memory format based on the DIOPI interface. + interface_stripped = interface.strip().split("(")[0] + if (interface_stripped not in self.convert_dict) or ("layout" not in self.convert_dict[interface_stripped]): + return self.default_layout + else: + return self.convert_dict[interface_stripped]["layout"] diff --git a/dipu/scripts/ci/ascend/ci_ascend_env.sh b/dipu/scripts/ci/ascend/ci_ascend_env.sh index d7e4d17d53..381d6eb4bc 100644 --- a/dipu/scripts/ci/ascend/ci_ascend_env.sh +++ b/dipu/scripts/ci/ascend/ci_ascend_env.sh @@ -14,6 +14,9 @@ export DIPU_PATH=${DIPU_ROOT} export PYTORCH_DIR=${ASCEND_TORCH_DIR} export PYTHONPATH=${PYTORCH_DIR}:${PYTHONPATH} +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + source /usr/local/Ascend/ascend-toolkit/set_env.sh ARCH=$(uname -m) diff --git a/dipu/scripts/ci/camb/ci_camb_env.sh b/dipu/scripts/ci/camb/ci_camb_env.sh index 7527809648..6b0de04a6a 100644 --- a/dipu/scripts/ci/camb/ci_camb_env.sh +++ b/dipu/scripts/ci/camb/ci_camb_env.sh @@ -1,9 +1,9 @@ PLATFORM=/mnt/lustre/share/platform -ENV_NAME=dipu_poc +ENV_NAME=pt2.0_diopi export PATH=`python ${PLATFORM}/env/clear_path.py PATH` export LD_LIBRARY_PATH=`python ${PLATFORM}/env/clear_path.py LD_LIBRARY_PATH` -GCC_ROOT=/mnt/lustre/share/platform/dep/gcc-7.5 -CONDA_ROOT=${PLATFORM}/env/miniconda3.8 +GCC_ROOT=/mnt/lustre/share/platform/dep/gcc-10.2 +CONDA_ROOT=${PLATFORM}/env/miniconda3.10 export NEUWARE_HOME=/usr/local/neuware export CC=${GCC_ROOT}/bin/gcc @@ -13,8 +13,8 @@ export CXX=${GCC_ROOT}/bin/g++ export DIOPI_ROOT=$(pwd)/third_party/DIOPI/impl/lib/ export DIPU_ROOT=$(pwd)/torch_dipu export LD_LIBRARY_PATH=$DIPU_ROOT:$LD_LIBRARY_PATH -export PYTHONPATH=${PYTORCH_DIR}/install_path/lib/python3.8/site-packages:${PYTHONPATH} -export PATH=${GCC_ROOT}/bin:${PYTORCH_DIR}/install_path/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PATH} +export PYTHONPATH=${PLATFORM}/dep/DIOPI_pytorch/pytorch2.0:${PYTHONPATH} +export PATH=${GCC_ROOT}/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PATH} export LD_PRELOAD=${GCC_ROOT}/lib64/libstdc++.so.6 @@ -33,6 +33,9 @@ export DIPU_HOST_MEMCACHING_ALGORITHM=BS #export DIPU_RAW_ALLOCATOR_MIN_ALLOCATE_SIZE=512 export DIPU_CHECK_TENSOR_DEVICE=1 +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + source activate $ENV_NAME echo "python path : ${PYTHONPATH}" diff --git a/dipu/scripts/ci/droplet/ci_droplet_env.sh 
b/dipu/scripts/ci/droplet/ci_droplet_env.sh index 5140be7c41..1bf7defe90 100644 --- a/dipu/scripts/ci/droplet/ci_droplet_env.sh +++ b/dipu/scripts/ci/droplet/ci_droplet_env.sh @@ -16,5 +16,8 @@ export DIPU_PATH=${DIPU_ROOT} export LIBRARY_PATH=$DIPU_ROOT:$DIOPI_ROOT:$LIBRARY_PATH export LD_LIBRARY_PATH=$DIPU_ROOT:$DIOPI_ROOT:$LD_LIBRARY_PATH +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + echo $ENV_PATH source activate $ENV_PATH diff --git a/dipu/scripts/ci/nv/ci_nv_env.sh b/dipu/scripts/ci/nv/ci_nv_env.sh index d885dc983e..453a1da092 100644 --- a/dipu/scripts/ci/nv/ci_nv_env.sh +++ b/dipu/scripts/ci/nv/ci_nv_env.sh @@ -2,14 +2,14 @@ PLATFORM=/mnt/cache/share/platform ENV_NAME=pt2.0_diopi export PATH=`python ${PLATFORM}/env/clear_path.py PATH` export LD_LIBRARY_PATH=`python ${PLATFORM}/env/clear_path.py LD_LIBRARY_PATH` -GCC_ROOT=${PLATFORM}/dep/gcc-7.5 -CONDA_ROOT=${PLATFORM}/env/miniconda3.8 +GCC_ROOT=${PLATFORM}/dep/gcc-10.2 +CONDA_ROOT=${PLATFORM}/env/miniconda3.10 export CC=${GCC_ROOT}/bin/gcc export CXX=${GCC_ROOT}/bin/g++ -export CUDA_PATH=${PLATFORM}/dep/cuda11.7-cudnn8.5 -export MPI_ROOT=${PLATFORM}/dep/openmpi-4.0.5-cuda11.7 -export NCCL_ROOT=${PLATFORM}/dep/nccl-2.13.4-cuda11.7 +export CUDA_PATH=${PLATFORM}/dep/cuda11.8-cudnn8.9 +export MPI_ROOT=${PLATFORM}/dep/openmpi-4.0.5-cuda11.8 +export NCCL_ROOT=${PLATFORM}/dep/nccl-2.15.5-cuda11.8 export GTEST_ROOT=${PLATFORM}/dep/googletest-gcc5.4 @@ -24,11 +24,10 @@ export DIOPI_ROOT=$(pwd)/third_party/DIOPI/impl/lib/ export DIPU_ROOT=$(pwd)/torch_dipu export DIOPI_PATH=$(pwd)/third_party/DIOPI/proto export DIPU_PATH=${DIPU_ROOT} -export PYTORCH_DIR=${PLATFORM}/env/miniconda3.8/envs/pt2.0_diopi/lib/python3.8/site-packages +export PYTORCH_DIR=${PLATFORM}/dep/DIOPI_pytorch/pytorch2.0_cu118 export LD_LIBRARY_PATH=$DIPU_ROOT:$LD_LIBRARY_PATH export PYTHONPATH=${PYTORCH_DIR}:${PYTHONPATH} export PATH=${GCC_ROOT}/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PLATFORM}/dep/binutils-2.27/bin:${PATH} -export LD_PRELOAD=${GCC_ROOT}/lib64/libstdc++.so.6 export PYTORCH_TEST_DIR=${PLATFORM}/env/miniconda3.8/envs/pt2.0_diopi/pytorch2.0 export CUBLAS_WORKSPACE_CONFIG=:4096:8 @@ -45,4 +44,10 @@ export DIPU_HOST_MEMCACHING_ALGORITHM=BF export DIPU_PATCH_CUDA_CACHED_ALLOCATOR=0 export DIPU_CHECK_TENSOR_DEVICE=1 +# Setting OMP_NUM_THREADS environment variable for each process in default, +# to avoid your system being overloaded, please further tune the variable +# for optimal performance in your application as needed. 
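The MKL_NUM_THREADS/OMP_NUM_THREADS exports added to each CI environment script (continued just below) keep BLAS and OpenMP on a single thread so that parallel CI jobs do not oversubscribe the shared hosts. The same policy can be applied from a Python entry point, as long as the variables are set before the numeric libraries are imported; a minimal sketch, assuming torch is the library being constrained:

import os

# Must be set before torch/numpy are imported; they are read at library load time.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")

import torch

torch.set_num_threads(1)  # additionally cap intra-op parallelism explicitly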
+export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + source activate $ENV_NAME diff --git a/dipu/scripts/ci/topsrider/ci_topsrider_env.sh b/dipu/scripts/ci/topsrider/ci_topsrider_env.sh index 250ba8284d..58d8b3787d 100644 --- a/dipu/scripts/ci/topsrider/ci_topsrider_env.sh +++ b/dipu/scripts/ci/topsrider/ci_topsrider_env.sh @@ -16,4 +16,7 @@ export VENDOR_INCLUDE_DIRS=/usr/include/tops export DIOPI_PATH=${DIPU_LOCAL_DIR}/third_party/DIOPI/proto export DIPU_PATH=${DIPU_ROOT} +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + # source activate $ENV_NAME diff --git a/dipu/tests/python/README.md b/dipu/tests/python/README.md index 0c68dc8cfd..31dbb7ef64 100644 --- a/dipu/tests/python/README.md +++ b/dipu/tests/python/README.md @@ -28,12 +28,12 @@ - 对于带有随机性的 op,可以考虑考察其分布的特征(参考 multinomial、random 等)。 - 可以考虑不使用 assertion,只检测 error 不检测 failure(加上注释说明)。 - `torch.allclose` **不**检测 shape、dtype 等,请谨慎使用。 - - 如果需要检查 C++ 库内部的输出,可以使用 `test.python.utils.stdout_redirector.stdout_redirector` 来捕获。 + - 如果需要检查 C++ 库内部的输出,可以使用 `utils.stdout_redirector.stdout_redirector` 来捕获。 - 如果需要使用输出辅助 debug,可以考虑在使用 unittest 的 assertion 函数时传入 [`msg` 参数](https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertEqual)。 - **请勿**做对全局空间有影响的事,例如: - 修改 import 库的内容; - 在全局空间中定义其他函数和变量(考虑挪至 class 内); - - 修改环境变量(可使用 `test.python.utils.local_eviron.local_eviron`); + - 修改环境变量(可使用 `utils.local_eviron.local_eviron`); - 应根据 torch 的文档广泛地测试各种使用场景。 - 尽量借助 setUp()、class 变量等方式简化代码,不要复制大量代码,以便后续维护。 - 对于预期会失败的测例,可以使用 `onlyOn` 和 `skipOn` 修饰器设置在某些设备上跳过测例(参考 cdist)。 @@ -46,17 +46,21 @@ 独立测例应该是一个可独立运行的 python 脚本。这些测试脚本会被自动转为单元测试,脚本返回值为 0 说明测试成功,否则测试失败。 -如果需要自动化检测 C++ 库内部的输出,可以使用 `test.python.utils.stdout_redirector.stdout_redirector` 来捕获。 +如果需要自动化检测 C++ 库内部的输出,可以使用 `utils.stdout_redirector.stdout_redirector` 来捕获。 独立测例可以包含 print。不过,在自动生成的单元测试中,独立测例中的输出会在测试通过的情况下被消除。 +可以使用 `utils.test_in_subprocess.run_individual_test_cases` 在同一个文件中进行多个独立测例的编写。 + #### 子进程的 coverage 收集 使用 `multiprocessing.Process` 创建的子进程在 CI 上跑 coverage 时不会被统计,因此使用这种测试方式(e.g. 
`test_allocator.py`)的独立测例需要一些特别的处理。 #### C++ `gcov` -在调用 `multiprocessing.Process` 之前,**必须**调用 `multiprocessing.set_start_method('spawn', force=True)` 修改 multiprocessing 的默认进程生成方式。 +~~在调用 `multiprocessing.Process` 之前,**必须**调用 `multiprocessing.set_start_method("spawn", force=True)` 修改 multiprocessing 的默认进程生成方式。~~ + +请使用 `utils.test_in_subprocess.run_individual_test_cases` 来创建子进程。 ##### Python `coverage` diff --git a/dipu/tests/python/individual_scripts/generate_unittest_for_individual_scripts.py b/dipu/tests/python/individual_scripts/generate_unittest_for_individual_scripts.py index c5e17db78b..39ab90d229 100644 --- a/dipu/tests/python/individual_scripts/generate_unittest_for_individual_scripts.py +++ b/dipu/tests/python/individual_scripts/generate_unittest_for_individual_scripts.py @@ -9,7 +9,7 @@ def generate_unittest_for_individual_scripts(): import io import os import unittest -from stdout_redirector import stdout_redirector +from utils.stdout_redirector import stdout_redirector class TestIndividualScripts(unittest.TestCase): diff --git a/dipu/tests/python/individual_scripts/local_eviron.py b/dipu/tests/python/individual_scripts/local_eviron.py deleted file mode 120000 index 7570555029..0000000000 --- a/dipu/tests/python/individual_scripts/local_eviron.py +++ /dev/null @@ -1 +0,0 @@ -../../../torch_dipu/testing/_internal/local_eviron.py \ No newline at end of file diff --git a/dipu/tests/python/individual_scripts/stdout_redirector.py b/dipu/tests/python/individual_scripts/stdout_redirector.py deleted file mode 120000 index fe5e70337c..0000000000 --- a/dipu/tests/python/individual_scripts/stdout_redirector.py +++ /dev/null @@ -1 +0,0 @@ -../../../torch_dipu/testing/_internal/stdout_redirector.py \ No newline at end of file diff --git a/dipu/tests/python/individual_scripts/test_allocator.py b/dipu/tests/python/individual_scripts/test_allocator.py index 9ebe2563f3..281c4d25fa 100644 --- a/dipu/tests/python/individual_scripts/test_allocator.py +++ b/dipu/tests/python/individual_scripts/test_allocator.py @@ -1,8 +1,15 @@ +import itertools import os -from multiprocessing import Process, set_start_method +from utils.test_in_subprocess import run_individual_test_cases -def test_allocator(max_allocate, step, algorithm, log_mask, test_pin_memory=True): +def test_allocator( + max_allocate: int, + step: int, + algorithm: str, + log_mask: int, + test_pin_memory: bool = True, +): os.environ["DIPU_DEVICE_MEMCACHING_ALGORITHM"] = algorithm os.environ["DIPU_DEBUG_ALLOCATOR"] = str(log_mask) os.environ["DIPU_MEM_CHECK"] = "1" @@ -67,35 +74,16 @@ def test_allocator(max_allocate, step, algorithm, log_mask, test_pin_memory=True if __name__ == "__main__": - set_start_method('spawn', force=True) - max_allocate = 1 << 15 - p1 = Process( - target=test_allocator, - args=(max_allocate, 1, "BF", 0), + MAX_ALLOCATE = 1 << 15 + run_individual_test_cases( + itertools.product( + (test_allocator,), + ( + {"args": (MAX_ALLOCATE, 1, "BF", 0)}, + {"args": (MAX_ALLOCATE, 1, "BS", 0)}, + {"args": (MAX_ALLOCATE, 1, "RAW", 0)}, + {"args": (MAX_ALLOCATE, 17919, "BF", 3, False)}, + ), + ), + in_parallel=False, ) - p1.start() - p1.join() - - p2 = Process( - target=test_allocator, - args=(max_allocate, 1, "BS", 0), - ) - p2.start() - p2.join() - - p3 = Process(target=test_allocator, args=(max_allocate, 1, "RAW", 0)) - p3.start() - p3.join() - - max_allocate = 1 << 30 - p4 = Process( - target=test_allocator, - args=(max_allocate, 17919, "BF", 3, False), - ) - p4.start() - p4.join() - - assert p1.exitcode == 0 - assert p2.exitcode 
== 0 - assert p3.exitcode == 0 - assert p4.exitcode == 0 diff --git a/dipu/tests/python/individual_scripts/test_dipu_fallback.py b/dipu/tests/python/individual_scripts/test_dipu_fallback.py index 8c4f65235e..f2dbf25027 100644 --- a/dipu/tests/python/individual_scripts/test_dipu_fallback.py +++ b/dipu/tests/python/individual_scripts/test_dipu_fallback.py @@ -1,32 +1,172 @@ # Copyright (c) 2023, DeepLink. import io -from stdout_redirector import stdout_redirector -from local_eviron import local_eviron +from typing import Callable, List +import torch +from utils.stdout_redirector import stdout_redirector +from utils.local_eviron import local_eviron +from utils.test_in_subprocess import run_individual_test_cases -def _test_dipu_fallback(): +def test_fallback( + op_names: List[str], + diopi_protos: List[str], + test_fn: Callable[[], None], + extra_check_str_in_output: List[str] = [], +) -> None: captured = io.BytesIO() with stdout_redirector(captured): with local_eviron( { - "DIPU_FORCE_FALLBACK_OPS_LIST": "add.out,sub.out", + "DIPU_FORCE_FALLBACK_OPS_LIST": ",".join(op_names), "DIPU_DUMP_OP_ARGS": "1", + "DIPU_LOG_FALLBACK_INFO": "1", } ): - import torch import torch_dipu - x = torch.randn(3, 4).cuda() - _ = x + x - _ = x - x - + test_fn() output = captured.getvalue().decode() - assert "force fallback has been set, add.out will be fallback to cpu" in output - assert "force fallback has been set, sub.out will be fallback to cpu" in output - assert "dipu_fallback" in output - assert "diopiAdd" not in output - assert "diopiSub" not in output + print(output, end="") + assert all( + f"force fallback has been set, {name} will be fallback to cpu" in output + for name in op_names + ) + assert all(item not in output for item in diopi_protos) + if extra_check_str_in_output is not None: + assert all(item in output for item in extra_check_str_in_output) + + +def _test_dipu_fallback(): + def fn(): + x = torch.randn(3, 4).cuda() + _ = x + x + _ = x - x + + test_fallback( + ["add.out", "sub.out"], ["diopiAdd", "diopiSub"], fn, ["dipu_fallback"] + ) + + +def _test_cpu_fallback(): + def fn(): + device = "cuda" + m = torch.nn.BatchNorm2d(100, affine=False).to(device) + input = torch.randn(20, 100, 35, 45).to(device) + m(input) + + test_fallback( + ["native_batch_norm"], + ["diopiBatchNorm"], + fn, + ["cpu_fallback:\taten::native_batch_norm", "dipu_fallback"], + ) + + +def _test_dipu_index_put_impl_fallback(): + def fn(): + dipu_tensor = torch.tensor([1, 2, 3, 4, 5]).cuda() + indices = torch.tensor([1, 3]).cuda() + values = torch.tensor([10, 40]).cuda() + torch._index_put_impl_(dipu_tensor, (indices,), values, accumulate=False) + + tensor = dipu_tensor.cpu() + indices = indices.cpu() + values = values.cpu() + torch._index_put_impl_(tensor, (indices,), values, accumulate=False) + + assert torch.allclose(tensor, dipu_tensor.cpu()) + + test_fallback( + ["_index_put_impl_"], + ["diopiIndexPut"], + fn, + ["custom fallback to cpu, name=_index_put_impl_"], + ) + + +def _test_dipu_copy_fallback_(): + def fn(): + source_tensor = torch.tensor([1.0, 2.0, 3.0]).cuda() + target_dipu = torch.zeros_like(source_tensor).cuda() + target_dipu.copy_(source_tensor) + + source_tensor = source_tensor.cpu() + target_tensor = torch.zeros_like(source_tensor) + target_tensor.copy_(source_tensor) + + assert torch.allclose(target_tensor, target_dipu.cpu()) + + test_fallback( + ["copy_"], + ["diopiCopyInp"], + fn, + ["custom fallback to dipu copy, name=copy_"], + ) + + +def _test_dipu_convolution_backward_overrideable_fallback(): + def fn(): 
+ torch.manual_seed(42) + device = torch.device("dipu") + m = torch.nn.Conv2d(2, 3, 3, stride=2).to(device) + m.weight = torch.nn.Parameter(torch.ones_like(m.weight)) + m.bias = torch.nn.Parameter(torch.ones_like(m.bias)) + input_dipu = torch.randn(2, 2, 5, 5).to(device).requires_grad_(True) + output_dipu = m(input_dipu) + output_dipu.backward(torch.ones_like(output_dipu)) + + torch.manual_seed(42) + m = torch.nn.Conv2d(2, 3, 3, stride=2) + m.weight = torch.nn.Parameter(torch.ones_like(m.weight)) + m.bias = torch.nn.Parameter(torch.ones_like(m.bias)) + input_cpu = torch.randn(2, 2, 5, 5, requires_grad=True) + output_cpu = m(input_cpu) + output_cpu.backward(torch.ones_like(output_cpu)) + + assert torch.allclose(output_dipu.cpu(), output_cpu) + assert torch.allclose(input_dipu.grad.cpu(), input_cpu.grad) + + test_fallback( + ["convolution_backward_overrideable"], + ["diopiConvolution2dBackward"], + fn, + ["custom fallback to cpu, name=convolution_backward_overrideable"], + ) + + +def _test_dipu_convolution_overrideable_fallback(): + def fn(): + m = torch.nn.Conv2d(2, 3, 3, stride=2).cuda() + m.weight = torch.nn.Parameter(torch.ones_like(m.weight)) + m.bias = torch.nn.Parameter(torch.ones_like(m.bias)) + input_dipu = torch.randn(2, 2, 5, 5).cuda() + output_dipu = m(input_dipu) + + m = m.cpu() + m.weight = torch.nn.Parameter(torch.ones_like(m.weight)) + m.bias = torch.nn.Parameter(torch.ones_like(m.bias)) + input_cpu = input_dipu.cpu() + output_cpu = m(input_cpu) + + assert torch.allclose(output_dipu.cpu(), output_cpu) + + test_fallback( + ["convolution_overrideable"], + ["diopiConvolution2d"], + fn, + ["custom fallback to cpu, name=convolution_overrideable"], + ) if __name__ == "__main__": - _test_dipu_fallback() + run_individual_test_cases( + [ + _test_dipu_fallback, + _test_cpu_fallback, + _test_dipu_index_put_impl_fallback, + _test_dipu_copy_fallback_, + _test_dipu_convolution_backward_overrideable_fallback, + _test_dipu_convolution_overrideable_fallback, + ], + in_parallel=True, + ) diff --git a/dipu/tests/python/individual_scripts/test_dipu_op_register.py b/dipu/tests/python/individual_scripts/test_dipu_op_register.py index 770c41cfd7..dd0f580e72 100644 --- a/dipu/tests/python/individual_scripts/test_dipu_op_register.py +++ b/dipu/tests/python/individual_scripts/test_dipu_op_register.py @@ -1,9 +1,11 @@ # Copyright (c) 2023, DeepLink. 
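The rewritten individual scripts in this area all funnel through `utils.test_in_subprocess.run_individual_test_cases`, whose implementation is not part of this diff. It replaces the old hand-rolled Process/start/join/exitcode boilerplate. A rough, hypothetical sketch of what such a helper needs to do (the real module may differ in detail):

from multiprocessing import get_context

def run_individual_test_cases(cases, in_parallel=False):
    # One spawned child per case; "spawn" (not fork) so per-child coverage/gcov data gets flushed.
    ctx = get_context("spawn")
    procs = []
    for case in cases:
        fn, spec = case if isinstance(case, tuple) else (case, {})
        p = ctx.Process(target=fn, args=spec.get("args", ()), kwargs=spec.get("kwargs", {}))
        p.start()
        if in_parallel:
            procs.append(p)
        else:
            p.join()
            assert p.exitcode == 0
    for p in procs:
        p.join()
        assert p.exitcode == 0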
-from multiprocessing import Process, set_start_method -from local_eviron import local_eviron +import itertools +from typing import Union +from utils.local_eviron import local_eviron +from utils.test_in_subprocess import run_individual_test_cases -def _test_op_register(mode): +def _test_op_register(mode: Union[int, str]) -> None: with local_eviron( {"DIPU_IMMEDIATE_REGISTER_OP": str(mode), "DIPU_DUMP_OP_ARGS": "1"} ): @@ -15,28 +17,14 @@ def _test_op_register(mode): if __name__ == "__main__": - set_start_method('spawn', force=True) - p1 = Process( - target=_test_op_register, - args=(0,), + run_individual_test_cases( + itertools.product( + (_test_op_register,), + ( + {"args": (0,)}, + {"args": (1,)}, + {"args": ("",)}, + ), + ), + in_parallel=True, ) - p1.start() - p1.join() - - p2 = Process( - target=_test_op_register, - args=(1,), - ) - p2.start() - p2.join() - - p3 = Process( - target=_test_op_register, - args=("",), - ) - p3.start() - p3.join() - - assert p1.exitcode == 0 - assert p2.exitcode == 0 - assert p3.exitcode == 0 diff --git a/dipu/tests/python/individual_scripts/test_dipu_profiler.py b/dipu/tests/python/individual_scripts/test_dipu_profiler.py new file mode 100644 index 0000000000..95dfbd8042 --- /dev/null +++ b/dipu/tests/python/individual_scripts/test_dipu_profiler.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023, DeepLink. +import os +os.environ["FORCE_USE_DIPU_PROFILER"] = "True" + +import tempfile +import torch +import torch_dipu +import torchvision.models as models +from torch.profiler import profile, ProfilerActivity +from torch_dipu.testing._internal.common_utils import TestCase, run_tests, onlyOn +from utils.local_eviron import local_eviron + + +class TestProfiler(TestCase): + def test_profiler(self): + model = models.resnet18().cuda() + inputs = torch.randn(5, 3, 224, 224).cuda() + + with local_eviron({"KINETO_LOG_LEVEL": "999"}): # suppress profiler logs + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + profile_memory=True, + record_shapes=True, + with_modules=True, + with_stack=True, + experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True) + ) as prof: + output = model(inputs) + output.sum().backward() + + profile_output = prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cuda_time_total", row_limit=1000 + ) + self.assertIn("diopiConvolution2dBackward", profile_output) + self.assertIn("dipu_convolution_", profile_output) + self.assertIn("LaunchKernel_dipu", profile_output) + self.assertIn("LaunchKernel_diopi", profile_output) + self.assertIn("Self CPU time total", profile_output) + self.assertIn("Self CUDA time total", profile_output) + self.assertIn("5, 3, 224, 224", profile_output) + + profile_stack_output = prof.key_averages(group_by_stack_n=15).table( + sort_by="cuda_time_total", row_limit=1000) + self.assertIn("Source Location", profile_stack_output) + self.assertIn("resnet.py", profile_stack_output) + + profile_memory_output = prof.key_averages().table( + sort_by="self_cuda_memory_usage", row_limit=1000) + self.assertIn("Self CPU Mem", profile_memory_output) + self.assertIn("Self CUDA Mem", profile_memory_output) + self.assertIn("Mb", profile_memory_output) + self.assertIn("Kb", profile_memory_output) + + with tempfile.TemporaryDirectory() as tmpdir: + prof.export_chrome_trace(f"{tmpdir}/dipu_resnet18_profiler.json") + + +if __name__ == "__main__": + run_tests() diff --git a/dipu/tests/python/individual_scripts/test_dumparg.py b/dipu/tests/python/individual_scripts/test_dumparg.py new file mode 100644 
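The new test_dumparg.py below leans on `utils.local_eviron.local_eviron` (relocated from torch_dipu.testing._internal later in this diff). Its contract is simply "set these environment variables for the duration of the with-block, then restore them"; an approximate sketch of that behaviour, not the project's actual code:

import os
from contextlib import contextmanager

@contextmanager
def local_eviron(envs):
    saved = {key: os.environ.get(key) for key in envs}
    os.environ.update(envs)
    try:
        yield
    finally:
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value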
index 0000000000..a2e3829ddf --- /dev/null +++ b/dipu/tests/python/individual_scripts/test_dumparg.py @@ -0,0 +1,29 @@ +# Copyright (c) 2023, DeepLink. +import io +from utils.stdout_redirector import stdout_redirector +from utils.local_eviron import local_eviron + + +def _test_copy_dumparg(): + captured = io.BytesIO() + with stdout_redirector(captured): + with local_eviron( + { + "DIPU_DUMP_OP_ARGS": "2", + } + ): + import torch + import torch_dipu + + source_tensor = torch.tensor([1.0, 2.0, 3.0]).cuda() + target_tensor = torch.zeros_like(source_tensor).cuda() + target_tensor.copy_(source_tensor) + + output = captured.getvalue().decode() + print(output) + assert "DIPUCopyInplace.run" in output + assert "numel: 3, sizes: [3], stride: [1], is_view: 0, dtype: float" in output + + +if __name__ == "__main__": + _test_copy_dumparg() diff --git a/dipu/tests/python/individual_scripts/test_memory_stats.py b/dipu/tests/python/individual_scripts/test_memory_stats.py index 34b044a1f2..3b50b5a377 100644 --- a/dipu/tests/python/individual_scripts/test_memory_stats.py +++ b/dipu/tests/python/individual_scripts/test_memory_stats.py @@ -1,8 +1,9 @@ +import itertools import os -from multiprocessing import Process, set_start_method +from utils.test_in_subprocess import run_individual_test_cases -def test_mem_stats(algorithm, log_mask): +def test_mem_stats(algorithm: str, log_mask: int): os.environ["DIPU_DEVICE_MEMCACHING_ALGORITHM"] = algorithm os.environ["DIPU_DEBUG_ALLOCATOR"] = str(log_mask) print("allocator algorithm:", algorithm) @@ -13,7 +14,7 @@ def test_mem_stats(algorithm, log_mask): ins = [] pin_ins = [] real_allocated = 0 - for i in range(100): + for _ in range(100): numel = random.randint(0, 1 << 20) x = torch.randn(numel).to(torch.device("cuda:0")) y = torch.randn(numel).pin_memory() @@ -37,7 +38,7 @@ def test_mem_stats(algorithm, log_mask): real_max_allocate = real_allocated - for i in range(len(ins)): + for _ in range(len(ins)): numel = ins[0].numel() real_allocated -= ((numel * 4 - 1) | 511) + 1 ins.pop(0) @@ -61,25 +62,14 @@ def test_mem_stats(algorithm, log_mask): if __name__ == "__main__": - set_start_method('spawn', force=True) - p1 = Process( - target=test_mem_stats, - args=("BF", 0), + run_individual_test_cases( + itertools.product( + (test_mem_stats,), + ( + {"args": ("BF", 0)}, + {"args": ("BS", 0)}, + {"args": ("RAW", 0)}, + ), + ), + in_parallel=False, ) - p1.start() - p1.join() - - p2 = Process( - target=test_mem_stats, - args=("BS", 0), - ) - p2.start() - p2.join() - - p3 = Process(target=test_mem_stats, args=("RAW", 0)) - p3.start() - p3.join() - - assert p1.exitcode == 0 - assert p2.exitcode == 0 - assert p3.exitcode == 0 diff --git a/dipu/tests/python/individual_scripts/test_profiler_communication.py b/dipu/tests/python/individual_scripts/test_profiler_communication.py index dfc279b0a3..f3cce135f4 100644 --- a/dipu/tests/python/individual_scripts/test_profiler_communication.py +++ b/dipu/tests/python/individual_scripts/test_profiler_communication.py @@ -1,5 +1,8 @@ import os +os.environ["FORCE_USE_DIPU_PROFILER"] = "True" + import random +import tempfile import torch import torch.distributed as dist import torch.nn as nn @@ -56,7 +59,8 @@ def demo_basic_ddp(rank, world_size, port): ) assert("c10d::allreduce_" in profile_output) assert("LaunchKernel_DiclAllreduce" in profile_output) - prof.export_chrome_trace(f"./dipu_resnet18_profiler_{rank}.json") + with tempfile.TemporaryDirectory() as tmpdir: + prof.export_chrome_trace(f"{tmpdir}/dipu_resnet18_profiler_{rank}.json") 
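The bookkeeping in test_memory_stats.py above models the caching allocator's 512-byte granularity: `((numel * 4 - 1) | 511) + 1` rounds the byte size of a float32 buffer up to the next multiple of 512. A small standalone check of that identity (illustrative only):

def round_up_512(nbytes: int) -> int:
    # (n - 1) | 511 sets the low nine bits, so adding 1 lands on the next multiple of 512.
    return ((nbytes - 1) | 511) + 1

assert round_up_512(1) == 512
assert round_up_512(512) == 512
assert round_up_512(513) == 1024
assert all(
    round_up_512(n) % 512 == 0 and 0 <= round_up_512(n) - n < 512
    for n in range(1, 5000)
)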
cleanup() def test_profiler_communication(): diff --git a/dipu/tests/python/individual_scripts/utils b/dipu/tests/python/individual_scripts/utils new file mode 120000 index 0000000000..468ba705ba --- /dev/null +++ b/dipu/tests/python/individual_scripts/utils @@ -0,0 +1 @@ +../utils \ No newline at end of file diff --git a/dipu/tests/python/unittests/stdout_redirector.py b/dipu/tests/python/unittests/stdout_redirector.py deleted file mode 120000 index fe5e70337c..0000000000 --- a/dipu/tests/python/unittests/stdout_redirector.py +++ /dev/null @@ -1 +0,0 @@ -../../../torch_dipu/testing/_internal/stdout_redirector.py \ No newline at end of file diff --git a/dipu/tests/python/unittests/test_conv2d.py b/dipu/tests/python/unittests/test_conv2d.py index e93181c670..b33677aef3 100644 --- a/dipu/tests/python/unittests/test_conv2d.py +++ b/dipu/tests/python/unittests/test_conv2d.py @@ -39,6 +39,23 @@ def test_conv_2d(self): ) # print("conv2d output compare successfully") + def test_conv2d_nhwc(self): + device = torch.device("dipu") + + m = nn.Conv2d(2, 3, 3).to(device=device, memory_format=torch.channels_last) + self.assertTrue(m.weight.is_contiguous(memory_format=torch.channels_last)) + + x = torch.rand(2, 2, 5, 5).to(device=device, memory_format=torch.channels_last) + x.requires_grad_() + self.assertTrue(x.is_contiguous(memory_format=torch.channels_last)) + + y = m(x) + self.assertTrue(y.is_contiguous(memory_format=torch.channels_last)) + + y.backward(torch.rand_like(y)) + self.assertTrue(x.grad.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(m.weight.grad.is_contiguous(memory_format=torch.channels_last)) + if __name__ == "__main__": run_tests() diff --git a/dipu/tests/python/unittests/test_layer_norm.py b/dipu/tests/python/unittests/test_layer_norm.py index aec7d0aa97..bb6424a811 100644 --- a/dipu/tests/python/unittests/test_layer_norm.py +++ b/dipu/tests/python/unittests/test_layer_norm.py @@ -76,6 +76,21 @@ def test_layer_norm_no_affine(self): ) self._run_layer_norm() + # maybe we don't want ChannelsLast -> Contiguous here, but just align with pytorch + # https://github.com/pytorch/pytorch/blob/v2.0.0/aten/src/ATen/native/cuda/layer_norm_kernel.cu#L1340-L1346 + def test_layer_norm_out_format(self): + l = torch.nn.LayerNorm(4).cuda() + xs = [ + torch.rand(2, 3, 5, 4, device='cuda').to(memory_format=torch.channels_last), + torch.rand(2, 4, 3, device='cuda').permute([0, 2, 1]), + torch.rand(2, 6, device='cuda')[:, 1:5], + ] + for x in xs: + y = l(x) + # seems can't get LEGACY_CONTIGUOUS_MEMORY_FORMAT in python, + # just assume it's MemoryFormat::Contiguous + self.assertTrue(y.is_contiguous()) + if __name__ == "__main__": run_tests() diff --git a/dipu/tests/python/unittests/test_minimum_maximum.py b/dipu/tests/python/unittests/test_minimum_maximum.py index eecc57bc18..a6b00383d4 100644 --- a/dipu/tests/python/unittests/test_minimum_maximum.py +++ b/dipu/tests/python/unittests/test_minimum_maximum.py @@ -15,6 +15,26 @@ def test_minimum(self): r_cpu = torch.minimum(a.to(self.cpu), b.to(self.cpu)) self.assertEqual(r_dipu.to(self.cpu), r_cpu) + def test_minimum_scalar(self): + # special test cases from the inference of internlm + a = torch.randn((3, 4)) + b = torch.tensor(torch.finfo(a.dtype).max) + # scalar on cpu + r_dipu1 = torch.minimum(a.to(self.dipu), b) + # scalar on device + r_dipu2 = torch.minimum(a.to(self.dipu), b.to(self.dipu)) + r_cpu = torch.minimum(a, b) + self.assertEqual(r_dipu1.to(self.cpu), r_cpu) + self.assertEqual(r_dipu2.to(self.cpu), r_cpu) + + def 
test_minimum_different_devices(self): + a = torch.tensor([1, -2, 3]) + b = torch.tensor([4, 0, 2]).to(self.dipu) + with self.assertRaises(RuntimeError) as context: + torch.minimum(a, b) + self.assertIn( + 'Expected all tensors to be on the same device', str(context.exception)) + def test_maximum(self): a = torch.tensor((1, 2, -1)) b = torch.tensor((3, 0, 4)) @@ -22,6 +42,26 @@ def test_maximum(self): r_cpu = torch.maximum(a.to(self.cpu), b.to(self.cpu)) self.assertEqual(r_dipu.to(self.cpu), r_cpu) + def test_maximum_scalar(self): + # special test cases from the inference of internlm + a = torch.randn((3, 4)) + b = torch.tensor(torch.finfo(a.dtype).min) + # scalar on cpu + r_dipu1 = torch.maximum(a.to(self.dipu), b) + # scalar on device + r_dipu2 = torch.maximum(a.to(self.dipu), b.to(self.dipu)) + r_cpu = torch.maximum(a, b) + self.assertEqual(r_dipu1.to(self.cpu), r_cpu) + self.assertEqual(r_dipu2.to(self.cpu), r_cpu) + + def test_maximum_different_devices(self): + a = torch.tensor([1, -2, 3]) + b = torch.tensor([4, 0, 2]).to(self.dipu) + with self.assertRaises(RuntimeError) as context: + torch.maximum(a, b) + self.assertIn( + 'Expected all tensors to be on the same device', str(context.exception)) + if __name__ == "__main__": run_tests() diff --git a/dipu/tests/python/unittests/test_mm.py b/dipu/tests/python/unittests/test_mm.py index f3c8a7eb10..992ece4f82 100644 --- a/dipu/tests/python/unittests/test_mm.py +++ b/dipu/tests/python/unittests/test_mm.py @@ -9,7 +9,7 @@ def test_mm(self): dipu = torch.device("dipu") cpu = torch.device("cpu") mat1 = torch.randn(2, 3) - mat2 = torch.randn(3, 3) + mat2 = torch.randn(3, 4) r1 = torch.mm(mat1.to(dipu), mat2.to(dipu)) r2 = torch.mm(mat1.to(cpu), mat2.to(cpu)) self.assertEqual(r1.to(cpu), r2) diff --git a/dipu/tests/python/unittests/test_prod.py b/dipu/tests/python/unittests/test_prod.py index 5f0f4fa3fa..24964673a9 100644 --- a/dipu/tests/python/unittests/test_prod.py +++ b/dipu/tests/python/unittests/test_prod.py @@ -25,12 +25,12 @@ def test_prod_bool(self): input_arrays = [[True, True], [True, False], [False, False]] for input_array in input_arrays: input_tensor = torch.tensor(input_array) - out = torch.prod(input_tensor).item() - out_cuda = torch.prod(input_tensor.cuda()).item() - self.assertEqual(out, out_cuda) + out = torch.prod(input_tensor) + out_cuda = torch.prod(input_tensor.cuda()) + self.assertEqual(out, out_cuda, exact_dtype=True) def test_prod_dtype(self): - test_dtypes = [torch.float16, torch.float32] + test_dtypes = [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64] for input_dtype in test_dtypes: input_tensor = torch.tensor( [[1, 2, 3], [4, 5, 6]], dtype=input_dtype, device="dipu" @@ -46,6 +46,20 @@ def test_prod_dtype(self): out = torch.prod(input_tensor, 1, dtype=output_dtype) self.assertEqual(out, expected_output, exact_dtype=True) + def test_prod_integer_promotion(self): + test_dtypes = [torch.int8, torch.int16, torch.int32] + for input_dtype in test_dtypes: + input_tensor = torch.tensor( + [[1, 2, 3], [4, 5, 6]], dtype=input_dtype, device="dipu" + ) + expected_output = torch.tensor(720, dtype=torch.int64, device="dipu") + out = torch.prod(input_tensor) + self.assertEqual(out, expected_output, exact_dtype=True) + + expected_output = torch.tensor([6, 120], dtype=torch.int64, device="dipu") + out = torch.prod(input_tensor, 1) + self.assertEqual(out, expected_output, exact_dtype=True) + if __name__ == "__main__": run_tests() diff --git a/dipu/tests/python/unittests/test_profiler.py 
b/dipu/tests/python/unittests/test_profiler.py index 5343ce1712..fbe75e3e68 100644 --- a/dipu/tests/python/unittests/test_profiler.py +++ b/dipu/tests/python/unittests/test_profiler.py @@ -1,10 +1,7 @@ # Copyright (c) 2023, DeepLink. import torch import torch_dipu -import torchvision.models as models -from torch.profiler import profile, ProfilerActivity from torch_dipu.testing._internal.common_utils import TestCase, run_tests, onlyOn -from torch_dipu.testing._internal.local_eviron import local_eviron import torch._dynamo as dynamo import subprocess @@ -17,50 +14,7 @@ def check_string_in_directory(directory, search_string): return False - class TestProfiler(TestCase): - def test_profiler(self): - model = models.resnet18().cuda() - inputs = torch.randn(5, 3, 224, 224).cuda() - - with local_eviron({"KINETO_LOG_LEVEL": "999"}): # suppress profiler logs - with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - profile_memory=True, - record_shapes=True, - with_modules=True, - with_stack=True, - experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True) - ) as prof: - output = model(inputs) - output.sum().backward() - - profile_output = prof.key_averages(group_by_input_shape=True).table( - sort_by="self_cuda_time_total", row_limit=1000 - ) - self.assertIn("diopiConvolution2dBackward", profile_output) - self.assertIn("dipu_convolution_", profile_output) - self.assertIn("LaunchKernel_dipu", profile_output) - self.assertIn("LaunchKernel_diopi", profile_output) - self.assertIn("Self CPU time total", profile_output) - self.assertIn("Self CUDA time total", profile_output) - self.assertIn("5, 3, 224, 224", profile_output) - - profile_stack_output = prof.key_averages(group_by_stack_n=15).table( - sort_by="cuda_time_total", row_limit=1000) - self.assertIn("Source Location", profile_stack_output) - self.assertIn("resnet.py", profile_stack_output) - self.assertIn("test_profiler.py", profile_stack_output) - - profile_memory_output = prof.key_averages().table( - sort_by="self_cuda_memory_usage", row_limit=1000) - self.assertIn("Self CPU Mem", profile_memory_output) - self.assertIn("Self CUDA Mem", profile_memory_output) - self.assertIn("Mb", profile_memory_output) - self.assertIn("Kb", profile_memory_output) - - prof.export_chrome_trace("./dipu_resnet18_profiler.json") - @onlyOn("NPU") def test_aot_profiler(self): x = torch.randn(3, 4).cuda() diff --git a/dipu/tests/python/unittests/test_profiler_cuda.py b/dipu/tests/python/unittests/test_profiler_cuda.py new file mode 100644 index 0000000000..3937cf4e7b --- /dev/null +++ b/dipu/tests/python/unittests/test_profiler_cuda.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023, DeepLink. 
+import tempfile +import torch +import torch_dipu +import torchvision.models as models +from torch.profiler import profile, ProfilerActivity +from torch_dipu.testing._internal.common_utils import TestCase, run_tests, onlyOn +from utils.local_eviron import local_eviron + + +class TestProfiler(TestCase): + @onlyOn("CUDA") + def test_profiler(self): + model = models.resnet18().cuda() + inputs = torch.randn(5, 3, 224, 224).cuda() + + with local_eviron({"KINETO_LOG_LEVEL": "999"}): # suppress profiler logs + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + profile_memory=True, + record_shapes=True, + with_modules=True, + with_stack=True, + experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True) + ) as prof: + output = model(inputs) + output.sum().backward() + + profile_output = prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cuda_time_total", row_limit=1000 + ) + self.assertNotIn("diopiConvolution2dBackward", profile_output) + self.assertNotIn("dipu_convolution_", profile_output) + self.assertNotIn("LaunchKernel_dipu", profile_output) + self.assertNotIn("LaunchKernel_diopi", profile_output) + self.assertIn("aten::cudnn_convolution", profile_output) + self.assertIn("aten::add", profile_output) + self.assertIn("vectorized_elementwise_kernel", profile_output) + self.assertIn("Self CPU time total", profile_output) + self.assertIn("Self CUDA time total", profile_output) + self.assertIn("5, 3, 224, 224", profile_output) + + profile_stack_output = prof.key_averages(group_by_stack_n=15).table( + sort_by="cuda_time_total", row_limit=1000) + self.assertIn("Source Location", profile_stack_output) + self.assertIn("resnet.py", profile_stack_output) + + profile_memory_output = prof.key_averages().table( + sort_by="self_cuda_memory_usage", row_limit=1000) + self.assertIn("Self CPU Mem", profile_memory_output) + self.assertIn("Self CUDA Mem", profile_memory_output) + self.assertIn("Mb", profile_memory_output) + self.assertIn("Kb", profile_memory_output) + + with tempfile.TemporaryDirectory() as tmpdir: + prof.export_chrome_trace(f"{tmpdir}/resnet18_profiler.json") + + +if __name__ == "__main__": + run_tests() diff --git a/dipu/tests/python/unittests/utils b/dipu/tests/python/unittests/utils new file mode 120000 index 0000000000..468ba705ba --- /dev/null +++ b/dipu/tests/python/unittests/utils @@ -0,0 +1 @@ +../utils \ No newline at end of file diff --git a/dipu/tests/python/utils/__init__.py b/dipu/tests/python/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dipu/torch_dipu/testing/_internal/local_eviron.py b/dipu/tests/python/utils/local_eviron.py similarity index 100% rename from dipu/torch_dipu/testing/_internal/local_eviron.py rename to dipu/tests/python/utils/local_eviron.py diff --git a/dipu/torch_dipu/testing/_internal/stdout_redirector.py b/dipu/tests/python/utils/stdout_redirector.py similarity index 100% rename from dipu/torch_dipu/testing/_internal/stdout_redirector.py rename to dipu/tests/python/utils/stdout_redirector.py index 903f023c51..64669caae9 100644 --- a/dipu/torch_dipu/testing/_internal/stdout_redirector.py +++ b/dipu/tests/python/utils/stdout_redirector.py @@ -48,12 +48,12 @@ def _redirect_stdout(to_fd): _redirect_stdout(tfile.fileno()) # Yield to caller, then redirect stdout back to the saved fd yield + finally: _redirect_stdout(saved_stdout_fd) # Copy contents of temporary file to the given stream tfile.flush() tfile.seek(0, io.SEEK_SET) stream.write(tfile.read()) - finally: tfile.close() 
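The stdout_redirector hunk around this point moves the fd restore and the copy of the captured bytes into the `finally` block, so the original stdout file descriptor is put back and the temporary file is drained even when the wrapped test body raises. A minimal sketch of that same dup/dup2-plus-finally pattern (simplified, hypothetical names; not the helper's actual signature):

    import io
    import os
    import sys
    import tempfile
    from contextlib import contextmanager

    @contextmanager
    def redirect_stdout_to(stream):
        # Duplicate the real stdout fd so it can be restored later.
        saved_fd = os.dup(sys.stdout.fileno())
        tfile = tempfile.TemporaryFile(mode="w+b")
        try:
            sys.stdout.flush()
            os.dup2(tfile.fileno(), sys.stdout.fileno())  # point fd 1 at the temp file
            yield
        finally:
            # Runs even if the body raised: restore stdout, then hand back the capture.
            sys.stdout.flush()
            os.dup2(saved_fd, sys.stdout.fileno())
            tfile.flush()
            tfile.seek(0, io.SEEK_SET)
            stream.write(tfile.read())
            tfile.close()
            os.close(saved_fd)

Usage is the usual context-manager shape: `buf = io.BytesIO(); with redirect_stdout_to(buf): print("hi")`, after which `buf.getvalue()` holds the captured output.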
os.close(saved_stdout_fd) diff --git a/dipu/tests/python/utils/test_in_subprocess.py b/dipu/tests/python/utils/test_in_subprocess.py new file mode 100644 index 0000000000..6268ea6997 --- /dev/null +++ b/dipu/tests/python/utils/test_in_subprocess.py @@ -0,0 +1,97 @@ +import io +import os +import pathlib +import queue +import sys +from multiprocessing import Process, Queue, set_start_method +from tempfile import TemporaryDirectory +from typing import Callable, Iterable, List, Tuple, TypedDict, Union +from .stdout_redirector import stdout_redirector + + +class Args(TypedDict, total=False): + args: tuple + kwargs: dict + + +def _run_individual_test_cases_sequential( + entry_points: Iterable[Tuple[Callable, Args]] +) -> None: + all_tests_pass = True + for entry_point, args in entry_points: + p = Process( + target=entry_point, args=args.get("args", ()), kwargs=args.get("kwargs", {}) + ) + p.start() + p.join() + all_tests_pass = all_tests_pass and p.exitcode == 0 + assert all_tests_pass + + +def _entry_point_wrapper( + entry_point: Callable, future_output: Queue, log_dir: str, *args, **kwargs +) -> None: + sys.stderr = open(f"{log_dir}/stderr_{os.getpid()}", "w") + captured = io.BytesIO() + try: + with stdout_redirector(captured): + entry_point(*args, **kwargs) + finally: + future_output.put(captured.getvalue().decode("utf-8")) + + +def _run_individual_test_cases_parallel( + entry_points: Iterable[Tuple[Callable, Args]] +) -> None: + with TemporaryDirectory() as tmpdir: + future_outputs: List[Queue] = [] + ps: List[Process] = [] + for entry_point, args in entry_points: + future_output = Queue() + p = Process( + target=_entry_point_wrapper, + args=(entry_point, future_output, tmpdir) + args.get("args", ()), + kwargs=args.get("kwargs", {}), + ) + p.start() + future_outputs.append(future_output) + ps.append(p) + + all_tests_pass = True + for p, future_output in zip(ps, future_outputs): + p.join() + try: + print(future_output.get_nowait(), end="") + except queue.Empty: + all_tests_pass = False + print( + pathlib.Path(f"{tmpdir}/stderr_{p.pid}").read_text(), + end="", + file=sys.stderr, + ) + all_tests_pass = all_tests_pass and p.exitcode == 0 + assert all_tests_pass + + +def run_individual_test_cases( + entry_points: Iterable[Union[Callable, Tuple[Callable, Args]]], + in_parallel: bool = False, +) -> None: + """ + Run test cases in individual processes in parallel or sequential. + WARN: This function must be called within an `if __name__ == "__main__"` region. + --- + Args: + `entry_points`: A sequence of test cases. Each test case is either a function + or a tuple of a function and its arguments + `(func, {"args": [...], "kwargs": {...}})`. + `in_parallel`: Whether to run test cases in parallel. 
+ """ + set_start_method("spawn", force=True) # this is required for gcov to work + uniform_entry_points: Iterable[Tuple[Callable, Args]] = map( + lambda x: x if isinstance(x, tuple) else (x, {}), entry_points + ) + if in_parallel: + _run_individual_test_cases_parallel(uniform_entry_points) + else: + _run_individual_test_cases_sequential(uniform_entry_points) diff --git a/dipu/third_party/DIOPI b/dipu/third_party/DIOPI index 9b9589b226..385ce67f65 160000 --- a/dipu/third_party/DIOPI +++ b/dipu/third_party/DIOPI @@ -1 +1 @@ -Subproject commit 9b9589b226d3a18482582037d9707574fe39fd48 +Subproject commit 385ce67f65c1c785c9a3713465c6489025da7bf1 diff --git a/dipu/third_party/kineto b/dipu/third_party/kineto index c1bed2f2dc..2923b3002a 160000 --- a/dipu/third_party/kineto +++ b/dipu/third_party/kineto @@ -1 +1 @@ -Subproject commit c1bed2f2dc3779dec2a63025ea1b72a957f4badf +Subproject commit 2923b3002a179d6dfe202e6d032567bb2816eae7 diff --git a/dipu/torch_dipu/csrc_dipu/CMakeLists.txt b/dipu/torch_dipu/csrc_dipu/CMakeLists.txt index 764c36c910..f12feb8558 100644 --- a/dipu/torch_dipu/csrc_dipu/CMakeLists.txt +++ b/dipu/torch_dipu/csrc_dipu/CMakeLists.txt @@ -29,12 +29,15 @@ add_custom_command( COMMAND python "${DIPU_AUTOGEN_DIOPI_WRAPPER_SCRIPT}" --config "${DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG}" --out "${DIPU_AUTOGENED_KERNELS_CPP}" + "$<$>:--convert_config=${CMAKE_SOURCE_DIR}/third_party/DIOPI/impl/${UsedVendor}/convert_config.yaml>" --use_diopi_adapter "False" --autocompare "False" --print_func_call_info "True" --print_op_arg "True" --fun_config_dict '{\"current_device\": \"${UsedVendor}\"}' DEPENDS ${DIPU_AUTOGEN_DIOPI_WRAPPER_SCRIPT} ${DIPU_AUTOGEN_DIOPI_WRAPPER_CONFIG} - ${DIPU_AUTOGEN_DIOPI_WRAPPER_TEMPLATE}) + ${DIPU_AUTOGEN_DIOPI_WRAPPER_TEMPLATE} + "$<$>:${CMAKE_SOURCE_DIR}/third_party/DIOPI/impl/${UsedVendor}/convert_config.yaml>" +) add_custom_target(autogen_diopi_kernels_cpp DEPENDS ${DIPU_AUTOGENED_KERNELS_CPP}) add_dependencies(${DIPU_AUTOGENED_KERNELS} autogen_diopi_kernels_cpp) diff --git a/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h b/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h index 010c07836c..36bc802fa3 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h +++ b/dipu/torch_dipu/csrc_dipu/aten/DIPUATenFunctions.h @@ -1,62 +1,65 @@ // Copyright (c) 2023, DeepLink. 
#pragma once -#include -#include -#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace dipu { namespace native { +namespace dipu_aten { +// dipu native func +at::Tensor empty(at::IntArrayRef size, c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); +at::Tensor empty_cpu(at::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); + +at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt); +at::Tensor empty_strided_cpu(at::IntArrayRef size, at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt); + +const at::Tensor& resize_(const at::Tensor& self, at::IntArrayRef size, + c10::optional memory_format); + +at::Scalar _local_scalar_dense_dipu(const at::Tensor& self); + +at::Tensor& set_storage_dipu_(at::Tensor& result, c10::Storage storage, + int64_t storage_offset, at::IntArrayRef size, + at::IntArrayRef stride); +at::Tensor& set_dipu_(at::Tensor& self); + +void resize_bytes_dipu(c10::StorageImpl* storage, size_t newsize_bytes); + +bool is_pinned(const at::Tensor& self, c10::optional device); +at::Tensor _pin_memory(const at::Tensor& self, + c10::optional device); -struct DIPUATenFunctions { - // dipu native func - static at::Tensor empty(at::IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt); - static at::Tensor empty_cpu( - at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, c10::optional pin_memory_opt, - c10::optional memory_format_opt); - - static at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); - static at::Tensor empty_strided_cpu(at::IntArrayRef size, - at::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt); - - static const at::Tensor& resize_( - const at::Tensor& self, at::IntArrayRef size, - c10::optional memory_format); - - static at::Scalar _local_scalar_dense_dipu(const at::Tensor& self); - - static at::Tensor& set_storage_dipu_(at::Tensor& result, c10::Storage storage, - int64_t storage_offset, - at::IntArrayRef size, - at::IntArrayRef stride); - static at::Tensor& set_dipu_(at::Tensor& self); - - static void resize_bytes_dipu(c10::StorageImpl* storage, - size_t newsize_bytes); - - static bool is_pinned(const at::Tensor& self, - c10::optional device); - static at::Tensor _pin_memory(const at::Tensor& self, - c10::optional device); - - // todo:: use same format as autogen - // diopi function defined in AutoGenedKernels.cpp, -}; +// todo:: use same format as autogen +// diopi function defined in AutoGenedKernels.cpp, +}; // namespace dipu_aten } // namespace native } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp index 6898e83b0c..e03796b938 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp +++ 
b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp @@ -14,42 +14,57 @@ #include #include -using dnative = dipu::native::DIPUATenFunctions; - -static std::string force_fallback_operators_list = []() -> std::string { - std::ifstream stream(".dipu_force_fallback_op_list.config", - std::ios_base::in | std::ios::binary); - std::string content; - const char* env = std::getenv("DIPU_FORCE_FALLBACK_OPS_LIST"); - if (env != nullptr) { - content += env; - } - if (stream.is_open()) { - while (!stream.eof()) { - std::string line; - stream >> line; - content += "," + line; +namespace dnative = dipu::native::dipu_aten; + +namespace dipu { +namespace { + +void read_comma_separated_list(std::istream& input, + std::vector& output) { + auto line = std::string(); + while (std::getline(input, line)) { + auto buffer = std::stringstream(line); + auto value = std::string(); + while (std::getline(buffer, value, ',')) { + output.push_back(std::move(value)); } } - return content; -}(); +} + +std::vector getFallbackList() { + auto fallback_list = std::vector(); + if (auto env = std::getenv("DIPU_FORCE_FALLBACK_OPS_LIST")) { + auto iss = std::stringstream(env); + read_comma_separated_list(iss, fallback_list); + } + auto file = std::ifstream(".dipu_force_fallback_op_list.config", + std::ios_base::in | std::ios::binary); + read_comma_separated_list(file, fallback_list); + + return fallback_list; +} + +const std::vector force_fallback_operators_list = + getFallbackList(); + +} // end of namespace -namespace dipu { bool get_force_fallback(const char* opname) { - if (force_fallback_operators_list.size() <= 0 || opname == nullptr) { + if (force_fallback_operators_list.empty() || opname == nullptr) { return false; - } else { - std::stringstream strstream(force_fallback_operators_list); - std::string force_fallback_pattern; - while (std::getline(strstream, force_fallback_pattern, ',')) { - if (force_fallback_pattern.size() <= 0) { - continue; - } + } + for (auto& force_fallback_pattern : force_fallback_operators_list) { + if (force_fallback_pattern.empty()) { + continue; + } + try { bool force_fallback = std::regex_match(opname, std::regex(force_fallback_pattern)); if (force_fallback) { return true; } + } catch (const std::regex_error& e) { + TORCH_CHECK(false, e.what()); } } return false; @@ -76,7 +91,7 @@ void dump_fallback_op_args(const c10::OperatorHandle& op, const auto num_arguments = schema_args.size(); auto arguments = torch::jit::last(stack, num_arguments); - auto dumpTensor = [&](const at::Tensor tensor) { + auto dumpTensor = [&](const at::Tensor& tensor) { if (tensor.defined()) { std::cout << "numel: " << tensor.numel() << ", sizes: " << tensor.sizes() << ", stride: " << tensor.strides() @@ -97,7 +112,6 @@ void dump_fallback_op_args(const c10::OperatorHandle& op, } }; - const auto arguments_begin = stack->size() - num_arguments; for (const auto idx : c10::irange(arguments.size())) { std::cout << "\t" << name << ": \t" << schema_args[idx].name() << ": "; const auto& ivalue = arguments[idx]; @@ -108,9 +122,9 @@ void dump_fallback_op_args(const c10::OperatorHandle& op, } else if (ivalue.isTensorList()) { const auto& tensorlist = ivalue.toTensorList(); std::cout << std::endl; - for (size_t i = 0; i < tensorlist.size(); i++) { + for (const auto& tensor : tensorlist) { std::cout << "\t"; - dumpTensor(tensorlist[i]); + dumpTensor(tensor); std::cout << std::endl; } } else { @@ -149,16 +163,17 @@ void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, } } +// 
NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) std::deque> DIPUOpRegister::dipuOpRegisterList; std::mutex DIPUOpRegister::mutex_; +// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) void DIPUOpRegister::register_op() { std::lock_guard guard(mutex_); - for (auto iter = dipuOpRegisterList.begin(); iter != dipuOpRegisterList.end(); - ++iter) { - torch::Library* lib = std::get<0>(*iter); - DIPUOpRegister::OpRegFunPtr fun_ptr = std::get<1>(*iter); + for (auto& iter : dipuOpRegisterList) { + torch::Library* lib = std::get<0>(iter); + DIPUOpRegister::OpRegFunPtr fun_ptr = std::get<1>(iter); fun_ptr(*lib); } dipuOpRegisterList.clear(); @@ -288,6 +303,7 @@ at::Scalar wrapper_DIPU___local_scalar_dense(const at::Tensor& self) { return dnative::_local_scalar_dense_dipu(self); } +// NOLINTBEGIN(performance-unnecessary-value-param) at::Tensor& wrapper_DIPU_source_Storage_set_(at::Tensor& self, at::Storage source) { // No device check @@ -302,10 +318,11 @@ at::Tensor& wrapper_DIPU_source_Storage_offset_set_( c10::SymIntArrayRef size, c10::SymIntArrayRef stride) { // No device check // DeviceGuard omitted - return dnative::set_storage_dipu_(self, source, storage_offset.expect_int(), - C10_AS_INTARRAYREF_SLOW(size), - C10_AS_INTARRAYREF_SLOW(stride)); + return dnative::set_storage_dipu_( + self, std::move(source), storage_offset.expect_int(), + C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride)); } +// NOLINTEND(performance-unnecessary-value-param) at::Tensor& wrapper_DIPU_source_Tensor_set_(at::Tensor& self, const at::Tensor& source) { @@ -413,7 +430,7 @@ DIPU_LIBRARY_IMPL(aten, DIPU_DEVICE_TYPE_MACRO, m) { class IgnoreWarningHandler : public c10::WarningHandler { public: - void process(const c10::Warning& warning) { + void process(const c10::Warning& warning) override { // do nothing } }; diff --git a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp index 9cef60995c..aa5acbb20b 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/RegisterDIPU.hpp @@ -18,6 +18,7 @@ void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack); // Print the warning message only once for one process. +// NOLINTBEGIN(bugprone-macro-parentheses): x cannot be in parentheses #define DIPU_LOG_WARNING_ONCE(x) \ do { \ static bool should_print = true; \ @@ -26,6 +27,7 @@ void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, should_print = false; \ } \ } while (0) +// NOLINTEND(bugprone-macro-parentheses) // Check the environment variable and call the DIPU_LOG_WARNING_ONCE #define DIPU_OP_LOG_WARNING_ONCE(...) 
\ @@ -53,8 +55,8 @@ void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, } else { \ DIPU_OP_LOG_WARNING_ONCE("force fallback has been set, "); \ } \ - DIPU_OP_LOG_WARNING_ONCE(opname << " will be fallback to cpu" \ - << std::endl); \ + DIPU_OP_LOG_WARNING_ONCE((opname) << " will be fallback to cpu" \ + << "\n"); \ } \ } while (false); @@ -62,7 +64,7 @@ void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, wapper_func, custom_fallback_func) \ do { \ if ((reinterpret_cast(diopi_func) != nullptr) && \ - !(force_fallback || dipu::get_force_fallback(opname))) { \ + !((force_fallback) || dipu::get_force_fallback(opname))) { \ m.impl(opname, TORCH_FN(wapper_func)); \ } else { \ if ((reinterpret_cast(diopi_func) == nullptr)) { \ @@ -70,22 +72,24 @@ void dipu_fallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, } else { \ DIPU_OP_LOG_WARNING_ONCE("force fallback has been set, "); \ } \ - DIPU_OP_LOG_WARNING_ONCE(opname << " will be fallback to cpu" \ - << std::endl); \ + DIPU_OP_LOG_WARNING_ONCE((opname) << " will be fallback to cpu" \ + << "\n"); \ m.impl(opname, TORCH_FN(custom_fallback_func)); \ } \ } while (false); class DIPUOpRegister { public: - typedef void (*OpRegFunPtr)(torch::Library&); + using OpRegFunPtr = void (*)(torch::Library&); private: OpRegFunPtr fun_ptr_; torch::Library lib_; + // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) static std::deque> dipuOpRegisterList; static std::mutex mutex_; + // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) public: DIPUOpRegister(OpRegFunPtr fun_ptr, const char* ns, @@ -97,7 +101,7 @@ class DIPUOpRegister { fun_ptr_(lib_); } else { std::lock_guard guard(mutex_); - dipuOpRegisterList.push_back(std::make_tuple(&lib_, fun_ptr_)); + dipuOpRegisterList.emplace_back(&lib_, fun_ptr_); } } @@ -106,8 +110,6 @@ class DIPUOpRegister { } // namespace at -namespace { - #define DIPU_LIBRARY_IMPL(ns, k, m) _DIPU_LIBRARY_IMPL(ns, k, m, C10_UID) #define _DIPU_LIBRARY_IMPL(ns, k, m, uid) \ @@ -124,6 +126,4 @@ namespace { []() { return [](torch::Library&) -> void {}; }), \ #ns, c10::make_optional(c10::DispatchKey::k), __FILE__, __LINE__); \ void C10_CONCATENATE(DIPU_LIBRARY_IMPL_init_##ns##_##k##_, \ - uid)(torch::Library & m) - -} // namespace + uid)(torch::Library & (m)) diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp index 7de896f582..955ef7a092 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp @@ -17,22 +17,23 @@ static c10::optional dipu_to_cpu( return cpu_tensor; } -static at::Tensor to_cpu_no_half(const at::Tensor& devtensor) { +static at::Tensor to_cpu_with_half_to_float(const at::Tensor& devtensor) { auto cpu_tensor = devtensor.cpu(); auto intype = devtensor.options().dtype_opt()->toScalarType(); if (intype == at::ScalarType::Half) { return cpu_tensor.to(at::ScalarType::Float); - } else { - return cpu_tensor; } + return cpu_tensor; } static at::Tensor& custom_fallback_dipu_silu_out(const at::Tensor& self, at::Tensor& out) { DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=silu_out" << std::endl); - auto self_cpu = to_cpu_no_half(self); - auto out_cpu = to_cpu_no_half(self); + auto self_cpu = to_cpu_with_half_to_float(self); + auto out_cpu = to_cpu_with_half_to_float(self); + + // NOLINTNEXTLINE(readability-suspicious-call-argument): It's the correct order out_cpu 
= at::silu_out(self_cpu, out_cpu); out.copy_(out_cpu); return out; @@ -153,7 +154,9 @@ custom_fallback_dipu_convolution_backward_overrideable( grad_output_cpu, input_cpu, weight_cpu, c10::nullopt, stride, padding, dilation, transposed, output_padding, groups, output_mask_temp); - at::Tensor grad_input, grad_weight, grad_bias; + at::Tensor grad_input; + at::Tensor grad_weight; + at::Tensor grad_bias; if (output_mask[0]) { grad_input = std::get<0>(result).to(device); @@ -226,8 +229,15 @@ custom_fallback_dipu_linear_backward(const at::Tensor& input, auto grad_output_cpu = grad_output.cpu(); auto weight_cpu = weight.cpu(); - at::Tensor grad_input_cpu, grad_weight_cpu, grad_bias_cpu; - at::Tensor grad_input, grad_weight, grad_bias; + at::Tensor grad_input; + at::Tensor grad_input_cpu; + + at::Tensor grad_weight; + at::Tensor grad_weight_cpu; + + at::Tensor grad_bias; + at::Tensor grad_bias_cpu; + int64_t dims = input.dim(); const auto device = input.device(); @@ -330,5 +340,64 @@ at::Tensor& custom_fallback_dipu__amp_update_scale_(at::Tensor& current_scale, double backoff_factor, int64_t growth_interval); +static at::Tensor& custom_fallback_dipu_addmm_out( + const at::Tensor& self, const at::Tensor& mat1, const at::Tensor& mat2, + const at::Scalar& beta, const at::Scalar& alpha, at::Tensor& out) { + auto self_cpu = to_cpu_with_half_to_float(self); + auto mat1_cpu = to_cpu_with_half_to_float(mat1); + auto mat2_cpu = to_cpu_with_half_to_float(mat2); + auto out_cpu = to_cpu_with_half_to_float(out); + out_cpu = at::addmm_out(out_cpu, self_cpu, mat1_cpu, mat2_cpu, beta, alpha); + out.copy_(out_cpu); + return out; +} + +static at::Tensor& custom_fallback_dipu_bmm_out(const at::Tensor& self, + const at::Tensor& mat2, + at::Tensor& out) { + auto self_cpu = to_cpu_with_half_to_float(self); + auto mat2_cpu = to_cpu_with_half_to_float(mat2); + auto out_cpu = to_cpu_with_half_to_float(out); + out_cpu = at::bmm_out(out_cpu, self_cpu, mat2_cpu); + out.copy_(out_cpu); + return out; +} + +static at::Tensor& custom_fallback_dipu_mm_out(const at::Tensor& self, + const at::Tensor& mat2, + at::Tensor& out) { + auto self_cpu = to_cpu_with_half_to_float(self); + auto mat2_cpu = to_cpu_with_half_to_float(mat2); + auto out_cpu = to_cpu_with_half_to_float(out); + out_cpu = at::mm_out(out_cpu, self_cpu, mat2_cpu); + out.copy_(out_cpu); + return out; +} + +static at::Tensor custom_fallback_dipu_linear( + const at::Tensor& input, const at::Tensor& weight, + const c10::optional& bias) { + auto input_cpu = to_cpu_with_half_to_float(input); + auto weight_cpu = to_cpu_with_half_to_float(weight); + c10::optional bias_cpu = c10::nullopt; + + at::Tensor out; + at::Tensor out_cpu; + + if (bias.has_value() && bias.value().defined()) { + if (bias.value().options().dtype_opt()->toScalarType() == + at::ScalarType::Half) { + bias_cpu = bias.value().to(at::ScalarType::Float).cpu(); + } else { + bias_cpu = bias.value().cpu(); + } + } + + out_cpu = at::linear(input_cpu, weight_cpu, bias_cpu); + out = out_cpu.to(input.device()) + .to(input.options().dtype_opt()->toScalarType()); + return out; +} + } // namespace native } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp index 2514e1e163..03a8fb2334 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctionsForAmpGradScaler.cpp @@ -18,7 +18,7 @@ void 
_amp_non_finite_check_and_unscale_(at::Tensor& scaled_grad, const at::Tensor& inv_scale) { scaled_grad *= inv_scale.item(); if (!scaled_grad.isfinite().all().item()) { - found_inf[0] = 1.f; + found_inf[0] = 1.F; } } @@ -46,8 +46,7 @@ void custom_fallback_dipu__amp_foreach_non_finite_check_and_unscale_( TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); for (const at::Tensor& t : scaled_grads) { - // NOLINTNEXTLINE: const_cast here is safe according to pytorch's source - // code + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast): const_cast here is safe according to pytorch's source code _amp_non_finite_check_and_unscale_(const_cast(t), found_inf, inv_scale); } diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp index b7c5c347d3..7181d9892e 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUAmp.cpp @@ -237,8 +237,10 @@ DIPU_DEFINE_CAST_POLICY_CONVERSION(kPromote, promote); // This function will throw an error message when // torch.nn.functional.binary_cross_entropy is called within an autocast block -Tensor DipuBinaryCrossEntropyBanned(const Tensor&, const Tensor&, - const c10::optional&, int64_t) { +Tensor DipuBinaryCrossEntropyBanned(const Tensor& /*unused*/, + const Tensor& /*unused*/, + const c10::optional& /*unused*/, + int64_t /*unused*/) { AT_ERROR( R"(torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast. Many models use a sigmoid layer right before the binary cross entropy layer. diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.cpp index eb75a7b8cb..523533cddf 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.cpp @@ -35,7 +35,8 @@ void setDipuCopyInstance(DIPUCopyBase* op) { dipu_copy_op() = op; } namespace dipu { namespace native { -at::Scalar DIPUATenFunctions::_local_scalar_dense_dipu(const at::Tensor& self) { +namespace dipu_aten { +at::Scalar _local_scalar_dense_dipu(const at::Tensor& self) { at::Scalar r; AT_DISPATCH_ALL_TYPES_AND2( at::kHalf, at::kBool, self.scalar_type(), "_local_scalar_dense_dipu", @@ -50,5 +51,6 @@ at::Scalar DIPUATenFunctions::_local_scalar_dense_dipu(const at::Tensor& self) { }); return r; } +} // namespace dipu_aten } // namespace native } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp index 47f519984e..bd79a4abde 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -15,12 +16,12 @@ namespace dipu { namespace native { // NOTICE: these 2 func defined in AutoGenedKernels.cpp // if dipu autogen support header file gen, remove this -at::Tensor dipu_wrap_diopi_cast_dtype(const at::Tensor& src, +at::Tensor dipu_wrap_diopi_cast_dtype(const at::Tensor& self, at::ScalarType dtype); // if dipu autogen support proxy one torch op to multiple diopi op, remove // this. 
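Tying the fallback changes earlier in this patch together: `getFallbackList()` reads comma-separated entries from the `DIPU_FORCE_FALLBACK_OPS_LIST` environment variable and from `.dipu_force_fallback_op_list.config`, treats each entry as a regex matched against the op name, and the custom fallbacks added in CustomFallbackFunctions.hpp (addmm/bmm/mm/linear and friends) then do the work on CPU and copy back. A hedged sketch of how a user might exercise that switch; the env var name and format come from the patch, but the concrete op-name patterns are placeholders and the list is read once when the extension loads, so it must be set before import:

    import os

    # Placeholder patterns; the exact op-name spelling expected by the regex match
    # is backend-defined.
    os.environ["DIPU_FORCE_FALLBACK_OPS_LIST"] = "silu.*,.*linear.*"

    import torch
    import torch_dipu  # fallback list is captured at load time

    x = torch.randn(4, 4, device="dipu")
    y = torch.nn.functional.silu(x)  # expected to route through the CPU fallback kernel

Note that with this patch an invalid regex in the list now fails loudly via `TORCH_CHECK` instead of being silently ignored.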
-at::Tensor& dipu_wrap_diopi_copy_inp(at::Tensor& dst, const at::Tensor& src, +at::Tensor& dipu_wrap_diopi_copy_inp(at::Tensor& self, const at::Tensor& src, bool non_blocking); } // namespace native @@ -46,7 +47,7 @@ inline void tryRecordStream(const at::Tensor& tensor, DIPUStream& curStream, bool is_default_stream) { if ((tensor.is_cpu() && tensor.options().pinned_memory()) || !is_default_stream) { - tensor.record_stream(curStream); + tensor.record_stream(curStream.unwrap()); } } diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp index 2837967029..0467b4a76d 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/EmptyOpsKernel.cpp @@ -1,21 +1,28 @@ // Copyright (c) 2023, DeepLink. #include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include #include -#include -#include -#include +#include "csrc_dipu/aten/DIPUATenFunctions.h" +#include "csrc_dipu/base/basedef.h" +#include "csrc_dipu/profiler/profiler.h" +#include "csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h" +#include "csrc_dipu/runtime/rthelper.h" -using at::Layout; -using c10::device_or_default; -using c10::layout_or_default; -using c10::StorageImpl; -using c10::TensorImpl; - -namespace dipu::native { +namespace dipu { +namespace native { static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { if (pin_memory) { @@ -24,11 +31,12 @@ static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { return c10::GetCPUAllocator(); } -at::Tensor DIPUATenFunctions::empty( - at::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { +at::Tensor dipu_aten::empty(at::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == dipu::DIPU_DEVICE_TYPE); @@ -42,7 +50,7 @@ at::Tensor DIPUATenFunctions::empty( memory_format_opt); } -at::Tensor DIPUATenFunctions::empty_cpu( +at::Tensor dipu_aten::empty_cpu( at::IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, @@ -61,15 +69,16 @@ at::Tensor DIPUATenFunctions::empty_cpu( } // use empty_generic, test -at::Tensor DIPUATenFunctions::empty_strided( - at::IntArrayRef size, at::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { +at::Tensor dipu_aten::empty_strided(at::IntArrayRef size, + at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__); auto device = c10::device_or_default(device_opt); AT_ASSERT(device.type() == dipu::DIPU_DEVICE_TYPE); - AT_ASSERT(layout_or_default(layout_opt) == Layout::Strided); + AT_ASSERT(c10::layout_or_default(layout_opt) == at::Layout::Strided); auto dtype = dtype_or_default(dtype_opt); c10::Allocator* allocator = dipu::getAllocator(dipu::DIPU_DEVICE_TYPE); @@ -78,11 +87,12 @@ at::Tensor DIPUATenFunctions::empty_strided( dtype); } -at::Tensor DIPUATenFunctions::empty_strided_cpu( - 
at::IntArrayRef size, at::IntArrayRef stride, - c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { +at::Tensor dipu_aten::empty_strided_cpu(at::IntArrayRef size, + at::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::device_or_default(device_opt).type() == c10::DeviceType::CPU); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(c10::layout_or_default(layout_opt) == @@ -96,4 +106,5 @@ at::Tensor DIPUATenFunctions::empty_strided_cpu( dtype); } -} // namespace dipu::native +} // namespace native +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp new file mode 100644 index 0000000000..38fed18260 --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp @@ -0,0 +1,40 @@ +// Copyright (c) 2023, DeepLink. +// +// This file contains useful wrappers for DIPU ATen functions. +// You should use `nodispatch::foo` instead of calling `at::foo` whenever +// possible to avoid dispatch overhead. + +#pragma once + +#include +#include +#include +#include +#include + +#include "csrc_dipu/aten/DIPUATenFunctions.h" + +namespace dipu { +namespace native { +namespace nodispatch { +// add any other `at::foo` functions you need here + +// an equivalent to `at::empty` but without dispatch +inline at::Tensor empty( + at::IntArrayRef size, at::TensorOptions options = {}, + c10::optional memory_format = c10::nullopt) { + return dipu_aten::empty( + size, c10::optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), + c10::impl::check_tensor_options_and_extract_memory_format(options, + memory_format)); +} + +// an simplified version of `at::empty_like` but without dispatch +inline at::Tensor empty_like(const at::Tensor& self) { + return empty(self.sizes(), self.options()); +} + +} // namespace nodispatch +} // namespace native +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp index 213f3ff3e3..b710fccd44 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp @@ -1,15 +1,30 @@ #pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "csrc_dipu/runtime/core/DIPUStream.h" #include #include -namespace dipu::native { - -inline bool checkDiopiReturnValue() { - static bool enable = - std::getenv("DIPU_DISABLE_CHECK_DIOPI_RETURN_VALUE") == nullptr; - return enable; -} +namespace dipu { +namespace native { inline bool checkTensorDevice() { static bool enable = []() { @@ -17,7 +32,7 @@ inline bool checkTensorDevice() { if (env_ptr == nullptr) { return false; } - return std::atoi(env_ptr) > 0 ? true : false; + return std::atoi(env_ptr) > 0; }(); return enable; } @@ -28,27 +43,26 @@ inline void synchronizeIfEnable() { DIPU_LOG_ONCE << "The synchronous operation is performed after " << "the diopi function call because the DIPU_SYNC_EXEC_MODE " "environment variable is set" - << std::endl; + << '\n'; dipu::getCurrentDIPUStream().synchronize(); } - return; } inline int dumpOpArgLevel() { - const char* env_ptr = std::getenv("DIPU_DUMP_OP_ARGS"); - int level = env_ptr ? 
std::atoi(env_ptr) : 0; + static const char* env_ptr = std::getenv("DIPU_DUMP_OP_ARGS"); + static int level = env_ptr ? std::atoi(env_ptr) : 0; return level; } template -static std::string dumpArg(const T& t) { +std::string dumpArg(const T& t) { std::stringstream stream; stream << t; return stream.str(); } template -static std::string dumpArg(const c10::optional& opt_t) { +std::string dumpArg(const c10::optional& opt_t) { std::stringstream stream; if (opt_t.has_value()) { stream << dumpArg(opt_t.value()); @@ -57,7 +71,7 @@ static std::string dumpArg(const c10::optional& opt_t) { } template -static std::string dumpArg(const c10::OptionalArrayRef& opt_t) { +std::string dumpArg(const c10::OptionalArrayRef& opt_t) { std::stringstream stream; if (opt_t.has_value()) { stream << dumpArg(opt_t.value()); @@ -66,7 +80,7 @@ static std::string dumpArg(const c10::OptionalArrayRef& opt_t) { } template class container> -static std::string dumpArg(const container& t) { +std::string dumpArg(const container& t) { std::stringstream stream; for (auto iter = t.begin(); iter != t.end(); ++iter) { stream << dumpArg(*iter) << ", "; @@ -75,10 +89,10 @@ static std::string dumpArg(const container& t) { } template <> -std::string dumpArg(const at::Tensor& tensor) { +inline std::string dumpArg(const at::Tensor& tensor) { std::stringstream stream; if (tensor.defined()) { - stream << "numel: " << tensor.numel() << ",sizes: " << tensor.sizes() + stream << "numel: " << tensor.numel() << ", sizes: " << tensor.sizes() << ", stride: " << tensor.strides() << ", is_view: " << tensor.is_view() << ", dtype: " << tensor.dtype() << ", device:" << tensor.device() << ", layout:" << tensor.layout() @@ -87,7 +101,7 @@ std::string dumpArg(const at::Tensor& tensor) { << ", memory_format: " << tensor.suggest_memory_format() << ", data_ptr: " << tensor.data_ptr(); if (dumpOpArgLevel() > 2) { - stream << std::endl << tensor; + stream << '\n' << tensor; } } else { stream << "undefined"; @@ -96,24 +110,24 @@ std::string dumpArg(const at::Tensor& tensor) { } template <> -std::string dumpArg(const at::Scalar& scalar) { +inline std::string dumpArg(const at::Scalar& t) { std::stringstream stream; - stream << scalar; + stream << t; return stream.str(); } template <> -std::string dumpArg(const c10::string_view& str) { - return dumpArg(std::string(str.data())); +inline std::string dumpArg(const c10::string_view& t) { + return dumpArg(std::string(t.data())); } template <> -std::string dumpArg(const at::Generator& generator) { +inline std::string dumpArg(const at::Generator& t) { return ""; } template -static std::string dumpArg(const std::array& t) { +std::string dumpArg(const std::array& t) { std::stringstream stream; for (auto iter = t.begin(); iter != t.end(); ++iter) { stream << dumpArg(*iter) << " "; @@ -122,27 +136,27 @@ static std::string dumpArg(const std::array& t) { } template <> -std::string dumpArg(const c10::List>& t) { +inline std::string dumpArg(const c10::List>& t) { std::stringstream stream; - stream << "size:" << t.size() << std::endl; + stream << "size:" << t.size() << '\n'; for (int i = 0; i < t.size(); ++i) { bool has_value = t[i].has_value(); stream << "\t" << i << "th: has_value:" << has_value << " "; if (has_value) { stream << dumpArg(t[i].value()); } - stream << std::endl; + stream << '\n'; } return stream.str(); } template class container1, template class container2> -static std::vector infer_reduce_op_shape( - const container1& input_shape, const container2& dims, - bool keepdim) { +std::vector infer_reduce_op_shape(const 
container1& input_shape, + const container2& dims, + bool keepdim) { if (dims.size() <= 0) { - return std::vector(); + return {}; } if (keepdim) { std::vector output_shape(input_shape.begin(), input_shape.end()); @@ -152,52 +166,52 @@ static std::vector infer_reduce_op_shape( output_shape[dim] = 1; } return output_shape; - } else { - std::vector output_shape; - output_shape.reserve(input_shape.size() - dims.size()); - for (int i = 0; i < input_shape.size(); ++i) { - bool reduce_dim = false; - for (auto iter = dims.begin(); iter != dims.end(); ++iter) { - auto dim = *iter; - dim += dim < 0 ? input_shape.size() : 0; - if (dim == i) { - reduce_dim = true; - break; - } - } - if (reduce_dim == false) { - output_shape.push_back(input_shape.at(i)); + } + std::vector output_shape; + output_shape.reserve(input_shape.size() - dims.size()); + for (int i = 0; i < input_shape.size(); ++i) { + bool reduce_dim = false; + for (auto iter = dims.begin(); iter != dims.end(); ++iter) { + auto dim = *iter; + dim += dim < 0 ? input_shape.size() : 0; + if (dim == i) { + reduce_dim = true; + break; } } - return output_shape; + if (!reduce_dim) { + output_shape.push_back(input_shape.at(i)); + } } + return output_shape; } -static std::string _allclose(const at::Tensor& a, const at::Tensor& b) { +inline std::string _allclose(const at::Tensor& a, const at::Tensor& b) { if (a.defined() && b.defined()) { try { - if (at::allclose(a.cpu(), b.cpu(), 1e-4, 1e-5, true)) { + constexpr double tolerance_absolute = 1e-4; + constexpr double tolerance_relative = 1e-5; + if (at::allclose(a.cpu(), b.cpu(), tolerance_absolute, tolerance_relative, + true)) { return "allclose"; - } else { - auto diff = at::abs(a.cpu() - b.cpu()); - auto mae = diff.mean().item(); - auto max_diff = diff.max().item(); - return "not_close, max diff: " + std::to_string(max_diff) + - ", MAE: " + std::to_string(mae); } + auto diff = at::abs(a.cpu() - b.cpu()); + auto mae = diff.mean().item(); + auto max_diff = diff.max().item(); + return "not_close, max diff: " + std::to_string(max_diff) + + ", MAE: " + std::to_string(mae); } catch (...) 
{ return "compare_error: not_close"; } } else { if (a.defined() != b.defined()) { return "not_close, one of tensor inputs is empty"; - } else { - return "allclose"; } + return "allclose"; } } -static std::string _allclose(const c10::ArrayRef& a, +inline std::string _allclose(const c10::ArrayRef& a, const c10::ArrayRef& b) { if (a.size() != b.size()) { return "not_allclose:"; @@ -209,4 +223,5 @@ static std::string _allclose(const c10::ArrayRef& a, return result; } -} // namespace dipu::native +} // namespace native +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp index 36e411c9c5..96e5decb88 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/PinMemoryKernel.cpp @@ -8,10 +8,11 @@ #include #include -namespace dipu::native { +namespace dipu { +namespace native { +namespace dipu_aten { -bool DIPUATenFunctions::is_pinned(const at::Tensor& self, - c10::optional device) { +bool is_pinned(const at::Tensor& self, c10::optional device) { // Only CPU tensors can be pinned if (!self.is_cpu()) { return false; @@ -22,13 +23,13 @@ bool DIPUATenFunctions::is_pinned(const at::Tensor& self, return dipu::isPinnedPtr(self.storage().data()); } -at::Tensor DIPUATenFunctions::_pin_memory(const at::Tensor& self, - c10::optional device) { +at::Tensor _pin_memory(const at::Tensor& self, + c10::optional device) { auto allocator = dipu::getAllocator(at::DeviceType::CPU); auto storage = c10::Storage(c10::Storage::use_byte_size_t(), - at::detail::computeStorageNbytes( - self.sizes(), self.strides(), self.dtype().itemsize()), + static_cast(at::detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize())), allocator, false); auto tensor = at::cpu::empty({0}, self.options()) .set_(storage, 0, self.sizes(), self.strides()); @@ -36,4 +37,6 @@ at::Tensor DIPUATenFunctions::_pin_memory(const at::Tensor& self, return tensor; } -} // namespace dipu::native +} // namespace dipu_aten +} // namespace native +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp b/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp index 4b33d68fa4..85a0f251b4 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/StorageShapeKernel.cpp @@ -12,25 +12,18 @@ #include #include -using at::IntArrayRef; -using at::Layout; -using c10::device_or_default; -using c10::layout_or_default; -using c10::MemoryFormat; -using c10::StorageImpl; -using c10::TensorImpl; -using dipu::devproxy::current_device; +namespace dipu { +namespace native { +namespace dipu_aten { -namespace dipu::native { -void DIPUATenFunctions::resize_bytes_dipu(StorageImpl* storage, - size_t newsize_bytes) { +void resize_bytes_dipu(c10::StorageImpl* storage, size_t newsize_bytes) { TORCH_CHECK(storage->resizable(), "Trying to resize dipu storage that is not resizable"); auto allocator = storage->allocator(); TORCH_CHECK(allocator != nullptr, "Trying to resize dipu storage without an allocator"); - auto device = current_device(); + auto device = dipu::devproxy::current_device(); dipu::DIPUStream stream = dipu::getCurrentDIPUStream(); if (newsize_bytes == 0) { storage->set_data_ptr_noswap( @@ -53,8 +46,9 @@ void DIPUATenFunctions::resize_bytes_dipu(StorageImpl* storage, storage->set_nbytes(newsize_bytes); } -static inline TensorImpl* _resize_impl_dipu_(TensorImpl* self, IntArrayRef size, - at::OptionalIntArrayRef stride) 
{ +static inline c10::TensorImpl* _resize_impl_dipu_( + c10::TensorImpl* self, at::IntArrayRef size, + at::OptionalIntArrayRef stride) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; } @@ -75,13 +69,12 @@ static inline TensorImpl* _resize_impl_dipu_(TensorImpl* self, IntArrayRef size, const c10::Storage& storage = self->unsafe_storage(); TORCH_CHECK(storage, "Tensor: invalid null storage"); if (self->numel() > 0 && new_storage_size > storage.nbytes()) { - DIPUATenFunctions::resize_bytes_dipu(storage.unsafeGetStorageImpl(), - new_storage_size); + resize_bytes_dipu(storage.unsafeGetStorageImpl(), new_storage_size); } return self; } -const at::Tensor& DIPUATenFunctions::resize_( +const at::Tensor& resize_( const at::Tensor& self, at::IntArrayRef size, c10::optional optional_memory_format) { if (self.has_names()) { @@ -89,22 +82,21 @@ const at::Tensor& DIPUATenFunctions::resize_( } auto* self_ = self.unsafeGetTensorImpl(); // not support stride now - _resize_impl_dipu_(self_, size, /*strides=*/c10::nullopt); + _resize_impl_dipu_(self_, size, /*stride=*/c10::nullopt); if (optional_memory_format.has_value()) { auto memory_format = optional_memory_format.value(); - TORCH_CHECK(memory_format != MemoryFormat::Preserve, + TORCH_CHECK(memory_format != at::MemoryFormat::Preserve, "Unsupported memory format", memory_format); self_->empty_tensor_restride(memory_format); } return self; } -at::Tensor& DIPUATenFunctions::set_storage_dipu_(at::Tensor& result, - c10::Storage storage, - int64_t storage_offset, - at::IntArrayRef size, - at::IntArrayRef stride) { - at::native::checkSetStorage(result, storage, storage_offset, size, stride); +at::Tensor& set_storage_dipu_(at::Tensor& result, c10::Storage storage, + int64_t storage_offset, at::IntArrayRef size, + at::IntArrayRef stride) { + at::native::checkSetStorage(result, std::move(storage), storage_offset, size, + stride); result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); at::OptionalIntArrayRef stride_opt = @@ -113,12 +105,15 @@ at::Tensor& DIPUATenFunctions::set_storage_dipu_(at::Tensor& result, return result; } -at::Tensor& DIPUATenFunctions::set_dipu_(at::Tensor& result) { - caffe2::TypeMeta dtype = result.dtype(); +at::Tensor& set_dipu_(at::Tensor& self) { + caffe2::TypeMeta dtype = self.dtype(); c10::Storage storage(c10::Storage::use_byte_size_t(), 0, dipu::getAllocator(dipu::DIPU_DEVICE_TYPE), true); - DIPUATenFunctions::set_storage_dipu_(result, storage, 0, {0}, {}); - TORCH_INTERNAL_ASSERT(dtype == result.dtype()); - return result; + set_storage_dipu_(self, storage, 0, {0}, {}); + TORCH_INTERNAL_ASSERT(dtype == self.dtype()); + return self; } -} // namespace dipu::native + +} // namespace dipu_aten +} // namespace native +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp index bb8fc899df..721431a5ac 100644 --- a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp +++ b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp @@ -20,8 +20,8 @@ static void printPromptAtStartup() { } static void initResourceImpl() { - static std::atomic_bool called(false); - if (called == true) { + static bool called(false); + if (called) { return; } called = true; @@ -33,8 +33,8 @@ static void initResourceImpl() { } static void releaseAllResourcesImpl() { - static std::atomic_bool called(false); - if (called == true) { + static bool called(false); + if (called) { return; } called = true; diff --git a/dipu/torch_dipu/csrc_dipu/base/basedef.h 
b/dipu/torch_dipu/csrc_dipu/base/basedef.h index 69de4ddede..755f745b5f 100644 --- a/dipu/torch_dipu/csrc_dipu/base/basedef.h +++ b/dipu/torch_dipu/csrc_dipu/base/basedef.h @@ -6,8 +6,6 @@ #include -auto static constexpr C10_COMPILE_TIME_MAX_DIPUS = 16; - #define DIPU_DEVICE_TYPE_MACRO XPU #define DIPU_AUTOGRAD_DEVICE_TYPE_MACRO \ C10_CONCATENATE(Autograd, DIPU_DEVICE_TYPE_MACRO) diff --git a/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h b/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h index 34baa917d9..048d26a992 100644 --- a/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h +++ b/dipu/torch_dipu/csrc_dipu/binding/DIPUpybind.h @@ -17,7 +17,7 @@ at::ScalarType dtypeToScalarType(PyObject* dtype_obj) { // In PyTorch they would write: // return reinterpret_cast(dtype_obj)->scalar_type; // But we do care about aliasing. - THPDtype dtype; + THPDtype dtype{}; std::memcpy(&dtype, dtype_obj, sizeof(dtype)); return dtype.scalar_type; } @@ -61,7 +61,7 @@ struct type_caster { public: PYBIND11_TYPE_CASTER(at::ScalarType, _("torch.dtype")); - bool load(py::handle src, bool) { + bool load(py::handle src, bool /*unused*/) { // Convert Python torch.dtype to at::ScalarType PyObject* obj = src.ptr(); if (isDtype(obj)) { @@ -75,7 +75,7 @@ struct type_caster { py::return_value_policy /* policy */, py::handle /* parent */) { // Convert at::ScalarType to Python torch.dtype - return py::handle(scalarTypeToDtype(src)); + return {{py::handle(scalarTypeToDtype(src))}}; } }; diff --git a/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp b/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp index 7f0853d5bf..a1c3cc7326 100644 --- a/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp +++ b/dipu/torch_dipu/csrc_dipu/binding/ExportRT.cpp @@ -15,14 +15,14 @@ #include "DIPUpybind.h" #include "exportapi.h" + using dipu::DIPUEvent; using dipu::DIPUStream; -using dipu::getDIPUStreamFromPool; namespace py = pybind11; namespace dipu { -static constexpr size_t kMega = 1024 * 1024; +static constexpr auto kMega = static_cast(1024 * 1024); using dipu::devapis::DIPUDeviceProperties; using dipu::devapis::DIPUDeviceStatus; @@ -112,13 +112,14 @@ static void exportStream(py::module& m) { dipu::DIPU_DEVICE_TYPE); } return DIPUStream(device_index, stream_id); - } else if (stream_ptr) { + } + if (stream_ptr) { return dipu::getStreamFromExternal( + // NOLINTNEXTLINE(performance-no-int-to-ptr) reinterpret_cast(stream_ptr), devproxy::current_device()); - } else { - return getDIPUStreamFromPool(); } + return getDIPUStreamFromPool(); }), py::arg("priority") = 0, py::arg("stream_id") = 0, py::arg("device_index") = 0, py::arg("device_type") = 0, @@ -149,10 +150,11 @@ static void exportStream(py::module& m) { [](DIPUStream& stream) -> int64_t { return static_cast(stream.device().type()); }) - .def_property_readonly("dipu_stream", - [](DIPUStream& stream) -> uint64_t { - return (uint64_t)stream.rawstream(); - }) + .def_property_readonly( + "dipu_stream", + [](DIPUStream& stream) -> uint64_t { + return reinterpret_cast(stream.rawstream()); + }) // use type_caster .def_property_readonly("device", [](DIPUStream& stream) -> at::Device { return stream.device(); @@ -202,7 +204,9 @@ static void exportEvent(py::module& m) { .def_property_readonly( "dipu_event", - [](DIPUEvent& self) { return (uint64_t)self.rawevent(); }) + [](DIPUEvent& self) { + return reinterpret_cast(self.rawevent()); + }) .def_property_readonly("device", [](DIPUEvent& self) { auto device = self.device().value(); return device; @@ -268,7 +272,7 @@ static void patchStorage(py::module& m) { "support 
other device type ", stor.device_type()); } else { - dipu::native::DIPUATenFunctions::resize_bytes_dipu( + dipu::native::dipu_aten::resize_bytes_dipu( stor.unsafeGetStorageImpl(), newsize); return stor; } @@ -276,8 +280,9 @@ static void patchStorage(py::module& m) { } static void patchTensor(py::module& m) { - m.def("is_dipu", - [](at::Tensor self) -> bool { return dipu::isDeviceTensor(self); }); + m.def("is_dipu", [](const at::Tensor& self) -> bool { + return dipu::isDeviceTensor(self); + }); } static void exportGenerator(py::module& m) { @@ -293,13 +298,13 @@ static void exportGenerator(py::module& m) { [](at::DeviceIndex idx) -> at::Tensor { return get_rng_state(idx); }); m.def("_set_rng_state", [](at::DeviceIndex idx, at::Tensor state) { - set_rng_state(idx, state); + set_rng_state(idx, std::move(state)); }); m.def("_is_in_bad_fork", []() -> bool { return is_in_bad_fork(); }); m.def("_create_dipu_generator", [](int idx) -> at::Generator { - at::DeviceIndex index = static_cast(idx); + auto index = static_cast(idx); return createDIPUGenerator(index); }); } diff --git a/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp b/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp index e5e57384e3..9e5893f520 100644 --- a/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp +++ b/dipu/torch_dipu/csrc_dipu/binding/ExportTensor.cpp @@ -35,6 +35,7 @@ static at::Tensor dispatch_to( non_blocking, copy); } +// TODO(fandaoyi): check memory leak // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) static std::shared_ptr splitArgs(PyObject* args) { ssize_t rawSize = PyTuple_Size(args); @@ -179,13 +180,15 @@ static PyObject* dipuMockCudaTensors(PyObject* _unused, PyObject* noargs) { // we prefer to use pybind11 to export patch func, cpython is used only patching // tensor-func which has complex dynamic parameters not easy to parsed by // pybind. -// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables) -static PyMethodDef TorchTensorMethods[] = { - {"dipu", castPyCFunctionWithKeywords(THPVariable_dipu), - METH_VARARGS | METH_KEYWORDS, nullptr}, - {"_mockCudaTensor", reinterpret_cast(dipuMockCudaTensors), - METH_NOARGS, nullptr}, - {nullptr, nullptr, 0, nullptr}}; - -DIPU_API PyMethodDef* exportTensorFunctions() { return TorchTensorMethods; } +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +static std::array TorchTensorMethods = { + {{"dipu", castPyCFunctionWithKeywords(THPVariable_dipu), + METH_VARARGS | METH_KEYWORDS, nullptr}, + {"_mockCudaTensor", reinterpret_cast(dipuMockCudaTensors), + METH_NOARGS, nullptr}, + {nullptr, nullptr, 0, nullptr}}}; + +DIPU_API PyMethodDef* exportTensorFunctions() { + return TorchTensorMethods.data(); +} } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp b/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp index 2d49804221..ce1857a814 100644 --- a/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp +++ b/dipu/torch_dipu/csrc_dipu/diopirt/diopi_helper.cpp @@ -1,5 +1,5 @@ // Copyright (c) 2023, DeepLink. 
-#include +#include #include "./diopirt_impl.h" @@ -24,7 +24,9 @@ ::diopiConstTensorHandle_t toDiopiTensorHandle(const at::Tensor* tensor) { ::diopiConstTensorHandle_t toDiopiTensorHandle( const c10::optional& tensor) { - if (!tensor.has_value()) return nullptr; + if (!tensor.has_value()) { + return nullptr; + } return toDiopiTensorHandle(tensor.value()); } @@ -36,7 +38,9 @@ ::diopiGeneratorHandle_t toDiopiGeneratorHandle(at::Generator& generator) { ::diopiGeneratorHandle_t toDiopiGeneratorHandle( c10::optional& generator) { - if (!generator.has_value()) return nullptr; + if (!generator.has_value()) { + return nullptr; + } return toDiopiGeneratorHandle(generator.value()); } @@ -73,11 +77,13 @@ ::diopiScalar_t toDiopiScalar(const at::Scalar& scalar, result.stype = ::diopiDtype_t::diopi_dtype_int64; result.ival = static_cast(scalar.toBool()); return result; - } else if (c10::isFloatingType(type)) { + } + if (c10::isFloatingType(type)) { result.stype = ::diopiDtype_t::diopi_dtype_float64; result.fval = scalar.toDouble(); return result; - } else if (c10::isIntegralType(type, false)) { + } + if (c10::isIntegralType(type, false)) { result.stype = ::diopiDtype_t::diopi_dtype_int64; result.ival = static_cast(scalar.toLong()); return result; @@ -152,6 +158,7 @@ caffe2::TypeMeta toATenType(::diopiDtype_t dt) { } } +// NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) int64_t getElemSize(::diopiDtype_t dt) { switch (dt) { case diopi_dtype_int32: @@ -179,6 +186,7 @@ int64_t getElemSize(::diopiDtype_t dt) { TORCH_CHECK(false, "invalid diopi type, diopi type is ", dt); } } +// NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) c10::DeviceType toATenDevice(::diopiDevice_t device) { switch (device) { @@ -191,11 +199,11 @@ c10::DeviceType toATenDevice(::diopiDevice_t device) { } } -::diopiSize_t toDiopiSize(const at::OptionalIntArrayRef& input) { +::diopiSize_t toDiopiSize(const at::OptionalIntArrayRef& dim) { ::diopiSize_t diopi_size{nullptr, 0}; - if (input.has_value()) { - diopi_size.data = input.value().data(); - diopi_size.len = input.value().size(); + if (dim.has_value()) { + diopi_size.data = dim.value().data(); + diopi_size.len = static_cast(dim.value().size()); } return diopi_size; } @@ -203,17 +211,19 @@ ::diopiSize_t toDiopiSize(const at::OptionalIntArrayRef& input) { ::diopiSize_t toDiopiSize(at::IntArrayRef input) { ::diopiSize_t diopi_size{nullptr, 0}; diopi_size.data = input.data(); - diopi_size.len = input.size(); + diopi_size.len = static_cast(input.size()); return diopi_size; } ::diopiRoundMode_t toDiopiRoundMode(const std::string& rounding_mode) { if (rounding_mode == "none" || rounding_mode == "None" || - rounding_mode.size() <= 0) { + rounding_mode.empty()) { return RoundModeNone; - } else if (rounding_mode == "floor") { + } + if (rounding_mode == "floor") { return RoundModeFloor; - } else if (rounding_mode == "trunc") { + } + if (rounding_mode == "trunc") { return RoundModeTrunc; } TORCH_CHECK(false, diff --git a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp index 1db7ec9d37..98e7141985 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp @@ -22,14 +22,18 @@ void CorrelationIDManager::popCorrelationID( type_.pop_back(); } -uint64_t CorrelationIDManager::getCorrelationID() const { +uint64_t CorrelationIDManager::getCorrelationID() { DeviceActivityInterface::CorrelationFlowType 
type = type_.back(); return external_ids_[type].back(); } -thread_local std::deque CorrelationIDManager::external_ids_ - [DeviceActivityInterface::CorrelationFlowType::End]; +thread_local std::array, + DeviceActivityInterface::CorrelationFlowType::End> + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + CorrelationIDManager::external_ids_; + thread_local std::deque + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) CorrelationIDManager::type_; } // namespace profile diff --git a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h index 0db965b980..79e478757a 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h @@ -1,8 +1,9 @@ #pragma once -#include +#include #include -#include + +#include "DeviceActivityInterface.h" namespace dipu { namespace profile { @@ -15,21 +16,25 @@ class CorrelationIDManager { // CorrelationIDManager designed as a singleton static CorrelationIDManager& instance(); - void pushCorrelationID( + static void pushCorrelationID( uint64_t id, libkineto::DeviceActivityInterface::CorrelationFlowType type); - void popCorrelationID( + static void popCorrelationID( libkineto::DeviceActivityInterface::CorrelationFlowType type); - uint64_t getCorrelationID() const; + static uint64_t getCorrelationID(); private: CorrelationIDManager() = default; - private: - thread_local static std::deque external_ids_ - [libkineto::DeviceActivityInterface::CorrelationFlowType::End]; + thread_local static std::array< + std::deque, + libkineto::DeviceActivityInterface::CorrelationFlowType::End> + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) + external_ids_; + thread_local static std::deque< libkineto::DeviceActivityInterface::CorrelationFlowType> + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) type_; }; diff --git a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp index 43a02954b7..c337e2a9b5 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp @@ -11,7 +11,6 @@ namespace dipu { namespace profile { -using libkineto::DeviceActivityInterface; using libkineto::GenericTraceActivity; DIPUDeviceActivity::~DIPUDeviceActivity() { @@ -34,12 +33,12 @@ void DIPUDeviceActivity::popCorrelationID( } void DIPUDeviceActivity::enableActivities( - const std::set& selectedActivities) {} + const std::set& selected_activities) {} void DIPUDeviceActivity::disableActivities( - const std::set& selectedActivities) { - if (selectedActivities.find(libkineto::ActivityType::CONCURRENT_KERNEL) != - selectedActivities.end()) { + const std::set& selected_activities) { + if (selected_activities.find(libkineto::ActivityType::CONCURRENT_KERNEL) != + selected_activities.end()) { setProfileOpen(false); } } @@ -52,18 +51,18 @@ void DIPUDeviceActivity::clearActivities() { int32_t DIPUDeviceActivity::processActivities( libkineto::ActivityLogger& logger, - std::function linkedActivity, - int64_t startTime, int64_t endTime) { + std::function linked_activity, + int64_t start_time, int64_t end_time) { FlushAllRecords(); - + constexpr size_t kMillisecondPerSecond = 1000; auto records = RecordsImpl::get().getAllRecordList(); for (const auto& record : records) { GenericTraceActivity act; - act.startTime = record.begin / 1000; - act.endTime = record.end / 1000; - 
act.id = record.opId; - act.device = record.pid; - act.resource = record.threadIdx; + act.startTime = static_cast(record.begin / kMillisecondPerSecond); + act.endTime = static_cast(record.end / kMillisecondPerSecond); + act.id = static_cast(record.opId); + act.device = static_cast(record.pid); + act.resource = static_cast(record.threadIdx); act.flow.id = record.opId; if (record.isKernel) { act.activityType = libkineto::ActivityType::CONCURRENT_KERNEL; @@ -76,17 +75,17 @@ int32_t DIPUDeviceActivity::processActivities( act.flow.id = record.opId; act.flow.type = libkineto::kLinkAsyncCpuGpu; auto link_cor_id = record.linkCorrelationId; - act.linked = linkedActivity(link_cor_id); + act.linked = linked_activity(static_cast(link_cor_id)); logger.handleGenericActivity(act); } std::map, libkineto::ResourceInfo> resource_infos = RecordsImpl::get().getResourceInfo(); for (const auto& kv : resource_infos) { - logger.handleResourceInfo(kv.second, startTime); + logger.handleResourceInfo(kv.second, start_time); } - return records.size(); + return static_cast(records.size()); } void DIPUDeviceActivity::teardownContext() {} @@ -98,6 +97,7 @@ void DIPUDeviceActivity::setMaxBufferSize(int32_t size) {} namespace libkineto { +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) DeviceActivityInterface* device_activity_singleton = &dipu::profile::DIPUDeviceActivity::instance(); diff --git a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h index cb5869bdd3..d7e193bf93 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h @@ -39,7 +39,6 @@ class DIPUDeviceActivity : public libkineto::DeviceActivityInterface { private: DIPUDeviceActivity() = default; - private: std::unordered_map> cpu_activities_; std::unordered_map> diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp index 90d82005a4..60ff2912fc 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp @@ -31,23 +31,18 @@ constexpr bool kKinetoAvailable{true}; using torch::profiler::perf_counters_t; using torch::profiler::impl::ActivityType; -using torch::profiler::impl::AppendOnlyList; using torch::profiler::impl::approx_time_t; using torch::profiler::impl::ExtraFields; using torch::profiler::impl::KinetoObserverContext; using torch::profiler::impl::op_input_t; using torch::profiler::impl::ProfilerConfig; -using torch::profiler::impl::ProfilerState; -using torch::profiler::impl::RawTensorMetadata; using torch::profiler::impl::Result; -using torch::profiler::impl::stacksToStr; using torch::profiler::impl::TensorMetadata; using torch::profiler::impl::kineto::ActivityTraceWrapper; using torch::profiler::impl::kineto::DeviceAndResource; using torch::profiler::impl::kineto::interface_trace_t; using torch::profiler::impl::kineto::kineto_ids; using torch::profiler::impl::python_tracer::CompressedEvent; -using torch::profiler::impl::python_tracer::PythonTracerBase; using result_ptr_t = std::shared_ptr; using trace_ptr_t = @@ -61,8 +56,8 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef values) { tags_.emplace_back(Tag::Scalar); // Scalars are small enough that they are stored in ivalues without an // extra memory alloc - // TODO: further optimize this by maybe giving Profiler access to the - // guts of IValue. 
+ // TODO(caikun-pjlab): further optimize this by maybe giving Profiler + // access to the guts of IValue. ivalues_.emplace_back(value); } else if (value.isTensorList()) { tags_.emplace_back(Tag::TensorListBegin); @@ -78,7 +73,7 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef values) { } void DIPUInputOutputEncoder::push(const at::Tensor& t) { - if (t.defined() && !t.is_nested()) { // TODO fix nested sizes + if (t.defined() && !t.is_nested()) { // TODO(caikun-pjlab) fix nested sizes tags_.emplace_back(Tag::Tensor); tensor_metadata_.emplace_back(t); tensor_sizes_strides_.copy(t.sizes()); @@ -196,8 +191,8 @@ uint64_t DIPUThreadLocalSubqueue::TorchOpStorage::EventBlock< // --------------------------------- std::unique_ptr DIPUThreadLocalSubqueue::begin_op( const at::RecordFunction& fn) { - KinetoObserverContext::Event* event; - uint64_t corr_id; + KinetoObserverContext::Event* event = nullptr; + uint64_t corr_id = 0; std::tie(event, corr_id) = torch_ops_.op_events_.emplace_back( fn.seqNr(), fn.forwardThreadId(), fn.scope(), fn.isAsync(), fn.debugHandle(), fn.name()); @@ -212,7 +207,7 @@ std::unique_ptr DIPUThreadLocalSubqueue::begin_op( } // backward nodes source range corresponds to the forward node - // TODO: consider using C++ stack trace + // TODO(caikun-pjlab): consider using C++ stack trace if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { auto cs = torch::profiler::impl::prepareCallstack(torch::jit::currentCallstack()); @@ -244,7 +239,7 @@ std::unique_ptr DIPUThreadLocalSubqueue::begin_op( namespace { template struct StealOrDefault { - StealOrDefault(T& container) + explicit StealOrDefault(T& container) : container_{container}, it_{container.begin()} {} ~StealOrDefault() { container_.get().clear(); } @@ -252,11 +247,10 @@ struct StealOrDefault { typename T::Iterator::value_type operator()() { if (it_.exhausted()) { return typename T::Iterator::value_type(); - } else { - auto result = std::move(*it_); - ++it_; - return result; } + auto result = std::move(*it_); + ++it_; + return result; } std::reference_wrapper container_; @@ -266,8 +260,8 @@ struct StealOrDefault { void DIPUThreadLocalSubqueue::TorchOpStorage::materialize( std::vector>& out, - const std::function time_converter, - const uint64_t tid, const DeviceAndResource& kineto_info) { + const std::function& time_converter, uint64_t tid, + const DeviceAndResource& kineto_info) { // Plumb Autograd info to the top level annotation. auto it = op_events_.begin(); for (C10_UNUSED const auto _ : @@ -297,7 +291,8 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize( auto input_getter = inputs_outputs_.getNextShapesAndDtypes(); - // TODO: CTAD will take care of template args when we move to C++17 + // TODO(caikun-pjlab): CTAD will take care of template args when we move to + // C++17 auto jit_stack = StealOrDefault(jit_stack_); auto jit_module = StealOrDefault(jit_modules_); auto extra_args = StealOrDefault(extra_args_); @@ -338,7 +333,9 @@ struct SubQueueThreadCache { // `sub_queue_cache_.key_` before attempting to access `ref_`, and if `key_` // does not match the DIPURecordQueue's *unique* `id_` it will evict // `sub_queue_cache_` and fall back to a different mechanism. 
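// Illustrative sketch only, not part of the patch: the thread-local fast path
// described in the comment above, using simplified stand-in types. Subqueue,
// SubQueueCache and cached_subqueue are hypothetical names; the real code uses
// DIPUThreadLocalSubqueue, SubQueueThreadCache and DIPURecordQueue's unique id_.
#include <cstdint>

struct Subqueue {};  // stand-in for DIPUThreadLocalSubqueue

struct SubQueueCache {
  uint32_t key_ = 0;         // id_ of the record queue that filled the cache
  Subqueue* ref_ = nullptr;  // cached subqueue for the current thread
};

thread_local SubQueueCache cache;  // mirrors sub_queue_cache_

inline Subqueue* cached_subqueue(uint32_t queue_id) {
  // Hit only when the cache was filled by the queue with this unique id; on a
  // miss the caller evicts the cache and falls back to the map-based lookup.
  return cache.key_ == queue_id ? cache.ref_ : nullptr;
}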
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::atomic queue_id_{0}; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) thread_local SubQueueThreadCache sub_queue_cache_{0, nullptr}; std::string toString( @@ -380,7 +377,7 @@ auto kinetoEventCorrelationID( } } // namespace -DIPUThreadLocalSubqueue::DIPUThreadLocalSubqueue(const uint64_t tid, +DIPUThreadLocalSubqueue::DIPUThreadLocalSubqueue(uint64_t tid, const ProfilerConfig& config) : tid_{tid}, config_{config}, kineto_info_{kineto_ids()} { libkineto::api().activityProfiler().recordThreadInfo(); @@ -400,7 +397,7 @@ DIPURecordQueue::DIPURecordQueue(const ProfilerConfig& config, } bool DIPURecordQueue::tracePython() const { - return config_.with_stack && activities_.count(ActivityType::CPU); + return config_.with_stack && (activities_.count(ActivityType::CPU) != 0U); } DIPUThreadLocalSubqueue* DIPURecordQueue::getSubqueue() { @@ -443,19 +440,22 @@ void mark_finished(std::shared_ptr& r) { TORCH_INTERNAL_ASSERT(r->endTimeNS() >= r->start_time_ns_, r->name()); } -static constexpr const char* indexKey = "Ev Idx"; +constexpr const char* indexKey = "Ev Idx"; void passEventsToKineto(const std::vector>& results, uint64_t start_time_us, uint64_t end_time_us) { - using namespace torch::profiler::impl::kineto; - TraceWrapper cpu_trace(start_time_us, "PyTorch Profiler"); + using torch::profiler::impl::kineto::addMetadata; + using torch::profiler::impl::kineto::TraceWrapper; + TraceWrapper cpu_trace(static_cast(start_time_us), + "PyTorch Profiler"); // Generate Kineto events for each event recorded by the PyTorch profiler. + constexpr time_t kNsPerUs = 1000; for (const auto i : c10::irange(results.size())) { const auto& e = results[i]; const auto* activity = cpu_trace.addCPUActivity( e->name(), e->kinetoType(), e->kineto_info_, e->correlationID(), - e->start_time_ns_ / 1000, e->endTimeNS() / 1000); + e->start_time_ns_ / kNsPerUs, e->endTimeNS() / kNsPerUs); TORCH_INTERNAL_ASSERT(activity || !kKinetoAvailable); if (activity) { @@ -464,7 +464,7 @@ void passEventsToKineto(const std::vector>& results, } // Kineto adds the events that it collected. - cpu_trace.transferCpuTrace(end_time_us); + cpu_trace.transferCpuTrace(static_cast(end_time_us)); } // There are two mechanisms that we use to connect Profiler and Kineto events. @@ -508,7 +508,7 @@ class TransferEvents { } private: - static long long extractIndex(const std::string& metadata_json) { + static int64_t extractIndex(const std::string& metadata_json) { static const auto prefix = fmt::format("\"{}\": ", indexKey); auto pos = metadata_json.find(prefix); return (pos == std::string::npos) ? unmatchedIndex : [&]() { @@ -561,7 +561,7 @@ class TransferEvents { } } - std::shared_ptr resultFromActivity(const itrace_t* activity) { + static std::shared_ptr resultFromActivity(const itrace_t* activity) { TORCH_INTERNAL_ASSERT(activity != nullptr); // Kineto is inconsistent with types, so we have to cast to int32. 
@@ -569,8 +569,9 @@ class TransferEvents { static_cast(activity->deviceId()), static_cast(activity->resourceId())}; + constexpr size_t kNsPerUs = 1000; auto event = Result::create( - activity->timestamp() * 1000, + activity->timestamp() * kNsPerUs, noTID, // Placeholder device_and_resource, ExtraFields{ @@ -580,7 +581,7 @@ class TransferEvents { activity->type(), {/*id=*/static_cast(activity->flowId()), /*type=*/static_cast(activity->flowType()), - /*start=*/activity->flowStart()}}); + /*start=*/static_cast(activity->flowStart())}}); // NB: It's tempting to set `event->kineto_activity_`; however we can only // guarantee that the events we passed to Kineto are of type @@ -668,9 +669,10 @@ class TransferEvents { e->visit(c10::overloaded( [&](const ExtraFields& i) { // Flow takes priority over linked event. - const auto it = flow_map.find(i.flow.id); + const auto it = flow_map.find(static_cast(i.flow.id)); if (it != flow_map.end() && - i.flow.type == libkineto::kLinkAsyncCpuGpu && !i.flow.start) { + i.flow.type == libkineto::kLinkAsyncCpuGpu && + (i.flow.start == 0U)) { e->parent_ = it->second; } @@ -692,7 +694,7 @@ class TransferEvents { } } - static constexpr long long unmatchedIndex = -1; + static constexpr int64_t unmatchedIndex = -1; static constexpr auto noTID = std::numeric_limits::max(); std::reference_wrapper>> results_; std::vector trace_activities_; @@ -706,7 +708,6 @@ ActivityTraceWrapper stopTrace() { trace_ptr_t addKinetoEvents(std::vector>& results, uint64_t start_time_us, uint64_t end_time_us, const ProfilerConfig& config) { - using namespace torch::profiler::impl::kineto; passEventsToKineto(results, start_time_us, end_time_us); // In on demand mode kineto is directly controlled by other machinery. @@ -739,58 +740,60 @@ void set_in_tree_building(std::vector& results, } } -void build_tree(std::vector>& sorted_events) { - set_in_tree_building(sorted_events, true); - +void push_event(std::shared_ptr& event, + ska::flat_hash_map>& stacks, + std::priority_queue, + ResultGreater>& end_events_) { + // Kineto builds subtrees using correlation ids and flows, so some Kineto + // events are already marked finished before the main tree building + // algorithm. It's fine to ignore them; the root event of these subtrees + // not a Kineto op and will be handled normally. using op_fields = ExtraFields; - ska::flat_hash_map> stacks; - std::priority_queue, ResultGreater> - end_events_; - auto push_event = [&stacks, &end_events_](std::shared_ptr& event) { - // Kineto builds subtrees using correlation ids and flows, so some Kineto - // events are already marked finished before the main tree building - // algorithm. It's fine to ignore them; the root event of these subtrees - // not a Kineto op and will be handled normally. 
- if (c10::holds_alternative< - ExtraFields>( - event->extra_fields_) && - event->finished_) { - return; - } + if (c10::holds_alternative< + ExtraFields>( + event->extra_fields_) && + event->finished_) { + return; + } - TORCH_INTERNAL_ASSERT(event->parent_.expired()); - for (const auto& child : event->children_) { - TORCH_INTERNAL_ASSERT(child->finished_); - } - TORCH_INTERNAL_ASSERT(!event->finished_); - - auto parent_it = stacks.find(event->start_tid_); - if (parent_it == stacks.end()) { - auto fwd_tid = event->visit( - c10::overloaded([](const op_fields& i) { return i.forward_tid_; }, - [](const auto&) -> uint64_t { return 0; })); - if (fwd_tid) { - parent_it = stacks.find(fwd_tid); - } + TORCH_INTERNAL_ASSERT(event->parent_.expired()); + for (const auto& child : event->children_) { + TORCH_INTERNAL_ASSERT(child->finished_); + } + TORCH_INTERNAL_ASSERT(!event->finished_); + + auto parent_it = stacks.find(event->start_tid_); + if (parent_it == stacks.end()) { + auto fwd_tid = event->visit( + c10::overloaded([](const op_fields& i) { return i.forward_tid_; }, + [](const auto&) -> uint64_t { return 0; })); + if (fwd_tid) { + parent_it = stacks.find(fwd_tid); } + } else { + event->parent_ = parent_it->second; + parent_it->second->children_.push_back(event); + } - if (parent_it != stacks.end()) { - event->parent_ = parent_it->second; - parent_it->second->children_.push_back(event); - } + if (event->endTimeNS() > event->start_time_ns_) { + stacks[event->start_tid_] = event; + end_events_.push(event); + } else if (event->endTimeNS() == std::numeric_limits::min()) { + // We use min time to indicate the lack of a termination event, so if we + // encounter such a case we don't push to `end_events_`. + stacks[event->start_tid_] = event; + } else { + mark_finished(event); + } +} - if (event->endTimeNS() > event->start_time_ns_) { - stacks[event->start_tid_] = event; - end_events_.push(event); - } else if (event->endTimeNS() == std::numeric_limits::min()) { - // We use min time to indicate the lack of a termination event, so if we - // encounter such a case we don't push to `end_events_`. - stacks[event->start_tid_] = event; - } else { - mark_finished(event); - } - }; +void build_tree(std::vector>& sorted_events) { + set_in_tree_building(sorted_events, true); + + ska::flat_hash_map> stacks; + std::priority_queue, ResultGreater> + end_events_; auto pop_event = [&stacks](std::shared_ptr event) { if (event->finished_) { @@ -823,7 +826,7 @@ void build_tree(std::vector>& sorted_events) { pop_event(end_events_.top()); end_events_.pop(); } - push_event(event); + push_event(event, stacks, end_events_); } // Cleanup remaining exit events. @@ -869,12 +872,10 @@ int64_t adjust_durations_dfs(std::shared_ptr& r) { r->name()); })); return children_total_duration; - } else { - return original_duration; } - } else { - return 0; + return original_duration; } + return 0; } /** @@ -961,6 +962,7 @@ DIPURecordQueue::getRecords(std::function time_converter, // Used as a replacement of if-constexpr (C++ 17) to implement static // polymorphism. 
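// Illustrative sketch only, not part of the patch: the stateless "overloaded
// functor" trick named in the comment above and used by the anonymous struct
// defined just below. Overload resolution selects the branch at compile time,
// which is what `if constexpr` would express in C++17. TorchOpEvent and
// BackendEvent are hypothetical stand-ins for the profiler event types.
#include <cstdint>

struct TorchOpEvent { int64_t start_ns; };
struct BackendEvent { int64_t start_us; };

struct StartTimeOf {
  int64_t operator()(const TorchOpEvent& e) const { return e.start_ns; }
  int64_t operator()(const BackendEvent& e) const { return e.start_us * 1000; }
};

// usage: StartTimeOf{}(TorchOpEvent{42}) == 42, StartTimeOf{}(BackendEvent{1}) == 1000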
+ constexpr time_t kNsPerUs = 1000; struct { std::reference_wrapper convert; using Event = torch::profiler::impl::EventType; @@ -968,7 +970,7 @@ DIPURecordQueue::getRecords(std::function time_converter, return convert(i.start_time_); } time_t operator()(const ExtraFields& i) const { - return i.start_time_us_ * 1000; + return i.start_time_us_ * kNsPerUs; } } start_time_of{std::ref(converter)}; @@ -1008,8 +1010,9 @@ DIPURecordQueue::getRecords(std::function time_converter, } if (python_tracer_) { - for (const auto& i : python_tracer_->getEvents(converter, python_enters, - end_time_us * 1000)) { + for (const auto& i : python_tracer_->getEvents( + converter, python_enters, + static_cast(end_time_us) * kNsPerUs)) { out.push_back(i); } python_tracer_.reset(); diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.h b/dipu/torch_dipu/csrc_dipu/profiler/collection.h index dc220b6f08..c937d796f6 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/collection.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.h @@ -43,7 +43,7 @@ class DIPUInputOutputEncoder final { enum class Tag { Tensor = 0, UndefinedTensor, - TensorListBegin, // TODO: generalize to other lists. + TensorListBegin, // TODO(caikun-pjlab): generalize to other lists. Scalar, Other, TERMINATOR @@ -68,7 +68,7 @@ class DIPUInputOutputEncoder final { class DIPUThreadLocalSubqueue { public: - DIPUThreadLocalSubqueue(const uint64_t tid, + DIPUThreadLocalSubqueue(uint64_t tid, const torch::profiler::impl::ProfilerConfig& config); std::unique_ptr begin_op( @@ -124,9 +124,9 @@ class DIPUThreadLocalSubqueue { // NB: This is a destructive operation. void materialize( std::vector>& out, - const std::function + const std::function& time_converter, - const uint64_t tid, + uint64_t tid, const torch::profiler::impl::kineto::DeviceAndResource& kineto_info); template diff --git a/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp b/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp index 51cee0fdc6..7d99301ded 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/patch.cpp @@ -35,6 +35,7 @@ ApproximateClockToUnixTimeConverter::measurePair() { } ApproximateClockToUnixTimeConverter::time_pairs +// NOLINTNEXTLINE(readability-convert-member-functions-to-static) ApproximateClockToUnixTimeConverter::measurePairs() { static constexpr auto n_warmup = 5; for (C10_UNUSED const auto _ : c10::irange(n_warmup)) { @@ -58,7 +59,8 @@ ApproximateClockToUnixTimeConverter::makeConverter() { for (const auto i : c10::irange(replicates)) { auto delta_ns = end_times[i].t_ - start_times_[i].t_; auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_; - scale_factors[i] = (double)delta_ns / (double)delta_approx; + scale_factors[i] = + static_cast(delta_ns) / static_cast(delta_approx); } std::sort(scale_factors.begin(), scale_factors.end()); long double scale_factor = scale_factors[replicates / 2 + 1]; @@ -76,14 +78,18 @@ ApproximateClockToUnixTimeConverter::makeConverter() { for (const auto i : c10::irange(replicates)) { auto dt = start_times_[i].t_ - t0; auto dt_approx = - (double)(start_times_[i].approx_t_ - t0_approx) * scale_factor; - t0_correction[i] = dt - (time_t)dt_approx; + static_cast(start_times_[i].approx_t_ - t0_approx) * + scale_factor; + t0_correction[i] = static_cast(dt - static_cast(dt_approx)); } - t0 += t0_correction[t0_correction.size() / 2 + 1]; + t0 += static_cast(t0_correction[t0_correction.size() / 2 + 1]); return [=](approx_time_t t_approx) { // See above for why this is more stable than `A * t_approx + 
B`. - auto result = (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0; + auto result = + static_cast(static_cast(t_approx - t0_approx) * + scale_factor) + + t0; return result; }; } @@ -98,12 +104,13 @@ namespace linux_perf { /* * Syscall wrapper for perf_event_open(2) */ -inline long perf_event_open(struct perf_event_attr* hw_event, pid_t pid, - int cpu, int group_fd, unsigned long flags) { +inline int64_t perf_event_open(struct perf_event_attr* hw_event, pid_t pid, + int cpu, int group_fd, uint64_t flags) { return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -// TODO sync with Kineto level abstract events in profiler/events.h +// TODO(caikun-pjlab): sync with Kineto level abstract events in +// profiler/events.h static const std::unordered_map< std::string, std::pair> EventTable{{"cycles", @@ -156,7 +163,7 @@ void PerfEvent::Init() { pid_t pid = getpid(); // this pid int cpu = -1; // all cpus int group_fd = -1; - unsigned long flags = 0; + uint64_t flags = 0; fd_ = static_cast(perf_event_open(&attr, pid, cpu, group_fd, flags)); if (fd_ == -1) { @@ -168,7 +175,7 @@ void PerfEvent::Init() { uint64_t PerfEvent::ReadCounter() const { PerfCounter counter{}; - long n = read(fd_, &counter, sizeof(PerfCounter)); + int64_t n = read(fd_, &counter, sizeof(PerfCounter)); TORCH_CHECK(n == sizeof(counter), "Read failed for Perf event fd, event : ", name_, ", error: ", std::strerror(errno)); @@ -197,7 +204,7 @@ void PerfProfiler::Configure(std::vector& event_names) { events_.back().Init(); } - // TODO + // TODO(caikun-pjlab): // Reset pthreadpool here to make sure we can attach to new children // threads } @@ -265,7 +272,7 @@ activity_t* TraceWrapper::addCPUActivity( auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back()); act.device = device_and_resource.device; act.resource = device_and_resource.resource; - act.id = correlation_id; + act.id = static_cast(correlation_id); act.startTime = start_time; if (type != libkineto::ActivityType::CPU_INSTANT_EVENT) { act.endTime = end_time; @@ -318,9 +325,11 @@ void ActivityTraceWrapper::save(const std::string& path) { void addMetadata(const activity_t* activity, const std::string& key, const std::string& value) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) const_cast(activity)->addMetadata(key, value); } +// NOLINTNEXTLINE(readability-const-return-type) const DeviceAndResource kineto_ids() { #ifdef USE_KINETO return {/*device=*/libkineto::processId(), @@ -330,14 +339,14 @@ const DeviceAndResource kineto_ids() { #endif // USE_KINETO } -struct RegisterLibKinetoClient { +const struct RegisterLibKinetoClient { RegisterLibKinetoClient() { libkineto::api(); } } register_libkineto_client; } // namespace kineto namespace { -static constexpr TensorImplAddress NoTensorImpl{nullptr}; +constexpr TensorImplAddress NoTensorImpl{nullptr}; struct RawTensorInfo { TensorImplAddress impl_; @@ -378,10 +387,51 @@ struct RawTensors { } template - void operator()(T&) {} + void operator()(T& t) {} std::vector tensors_; }; + +void FlattenToUniformRepresentation( + std::vector>& sorted_results, + std::vector& tensors) { + RawTensors raw_tensors; + // The python tracer caches values, so it's only safe to use the first case. 
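// Illustrative sketch only, not part of the patch: the insert().second idiom
// used just below to visit each module/optimizer only on its first occurrence
// (std::unordered_set stands in for ska::flat_hash_set; the function and its
// parameter are hypothetical).
#include <unordered_set>
#include <vector>

inline void visit_each_owner_once(const std::vector<const void*>& owners) {
  std::unordered_set<const void*> seen;
  for (const void* p : owners) {
    if (seen.insert(p).second) {
      // first time this owner is seen: collect its parameters, gradients,
      // optimizer state, and so on
    }
  }
}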
+ ska::flat_hash_set seen_modules; + ska::flat_hash_set seen_optimizers; + for (auto& result : sorted_results) { + result->visit(c10::overloaded( + [&](ExtraFields& torch_op) { + for (auto& i : torch_op.inputs_) { + c10::visit(raw_tensors, i); + } + }, + [&](ExtraFields& py_call) { + // torch.nn.Module + if (py_call.module_.has_value() && + seen_modules.insert(py_call.module_->self_).second) { + for (auto& p : py_call.module_->parameters_) { + raw_tensors(p.metadata_); + raw_tensors(p.grad_metadata_); + } + } + + // torch.optim.Optimizer + if (py_call.optimizer_.has_value() && + seen_optimizers.insert(py_call.optimizer_->self_).second) { + for (auto& p : py_call.optimizer_->parameters_) { + raw_tensors(p.metadata_); + raw_tensors(p.grad_metadata_); + for (auto& state_i : p.state_) { + raw_tensors(state_i.second); + } + } + } + }, + [&](auto& i) { raw_tensors(i); })); + } + tensors = std::move(raw_tensors.tensors_); +} } // namespace void calculateUniqueTensorIDs( @@ -393,45 +443,7 @@ void calculateUniqueTensorIDs( // Flatten results to a uniform representation. // -------------------------------------------------------------------------- - { - RawTensors raw_tensors; - - // The python tracer caches values, so it's only safe to use the first case. - ska::flat_hash_set seen_modules; - ska::flat_hash_set seen_optimizers; - for (auto& result : sorted_results) { - result->visit(c10::overloaded( - [&](ExtraFields& torch_op) { - for (auto& i : torch_op.inputs_) { - c10::visit(raw_tensors, i); - } - }, - [&](ExtraFields& py_call) { - // torch.nn.Module - if (py_call.module_.has_value() && - seen_modules.insert(py_call.module_->self_).second) { - for (auto& p : py_call.module_->parameters_) { - raw_tensors(p.metadata_); - raw_tensors(p.grad_metadata_); - } - } - - // torch.optim.Optimizer - if (py_call.optimizer_.has_value() && - seen_optimizers.insert(py_call.optimizer_->self_).second) { - for (auto& p : py_call.optimizer_->parameters_) { - raw_tensors(p.metadata_); - raw_tensors(p.grad_metadata_); - for (auto& state_i : p.state_) { - raw_tensors(state_i.second); - } - } - } - }, - [&](auto& i) { raw_tensors(i); })); - } - tensors = std::move(raw_tensors.tensors_); - } + FlattenToUniformRepresentation(sorted_results, tensors); // Assign IDs to solve ABA for Storage. 
// -------------------------------------------------------------------------- @@ -441,7 +453,7 @@ void calculateUniqueTensorIDs( ska::flat_hash_map versions; for (auto& t : tensors) { auto inserted = versions.insert({{t.storage_, t.device_}, counter}); - counter += inserted.second; + counter += static_cast(inserted.second); t.allocation_id_ref_.get().emplace(AllocationID(inserted.first->second)); if (t.is_free_) { versions.erase(inserted.first); @@ -503,7 +515,7 @@ void calculateUniqueTensorIDs( size_t current_id{0}; for (const auto& i : unique_pairs) { auto inserted = id_map.insert({i.first, current_id}); - current_id += inserted.second; + current_id += static_cast(inserted.second); id_map.insert({i.second, inserted.first->second}); } } diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp index ea23bf43f0..f87855158a 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp @@ -1,12 +1,17 @@ #include "profiler.h" -#include #include -#include +#include +#include #include +#include #include +#include "csrc_dipu/profiler/CorrelationIDManager.h" + +#include "ThreadUtil.h" + namespace dipu { namespace profile { @@ -15,7 +20,7 @@ static const int32_t DEFAULT_FLUSH_READY_INTERVAL = 1000; class DeviceEvent final { private: - deviceEvent_t evt_; + deviceEvent_t evt_{}; public: DeviceEvent() { dipu::devproxy::createEvent(&evt_); } @@ -34,26 +39,25 @@ class StreamTimeOffsetTracker final { DeviceEvent begin_; deviceStream_t stream_; size_t beginOffset_; - float ratio_ = 0.f; + float ratio_ = 0.F; public: - explicit StreamTimeOffsetTracker(deviceStream_t stream) { - stream_ = stream; + explicit StreamTimeOffsetTracker(deviceStream_t stream) + : stream_(stream), beginOffset_(torch::profiler::impl::getTime()) { devproxy::recordEvent(begin_.get(), stream_); devproxy::waitEvent(begin_.get()); - beginOffset_ = torch::profiler::impl::getTime(); } ~StreamTimeOffsetTracker() = default; void sync() { DeviceEvent end; - float time; + float time = 0.F; dipu::devproxy::recordEvent(end.get(), stream_); dipu::devproxy::waitEvent(end.get()); dipu::devproxy::eventElapsedTime(&time, begin_.get(), end.get()); size_t endOffset = torch::profiler::impl::getTime(); - ratio_ = 1.0f * (endOffset - beginOffset_) / time; + ratio_ = static_cast(endOffset - beginOffset_) / time; } const DeviceEvent& begin() const { return begin_; } @@ -118,6 +122,7 @@ RecordsImpl::getResourceInfo() const { return resourceInfo_; } +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) thread_local RecordsImpl::records_t* RecordsImpl::pRecords = nullptr; class DeviceRecordsImpl final { @@ -128,12 +133,14 @@ class DeviceRecordsImpl final { std::vector ready_records_; std::unique_ptr pTracker_; - private: DeviceRecordsImpl() {} static bool enableFlushReadyEvent() { + // There is no limit for cuda events on nv, so regular flushing is not + // necessary, thus reducing operator time consumption static bool enable_flush_ready = - (std::getenv("DIPU_DISABLE_FLUSH_READY_EVENT") == nullptr); + (std::getenv("DIPU_DISABLE_FLUSH_READY_EVENT") == nullptr && + VENDOR_TYPE != devapis::VendorDeviceType::CUDA); return enable_flush_ready; } @@ -151,7 +158,7 @@ class DeviceRecordsImpl final { } size_t getTime(const DeviceEvent& evt, float scale = 1., size_t shift = 0) { - float time; + float time = 0.F; dipu::devproxy::waitEvent(evt.get()); dipu::devproxy::eventElapsedTime(&time, beginEvent(), evt.get()); return static_cast(time * scale) 
+ shift; @@ -160,17 +167,16 @@ class DeviceRecordsImpl final { public: ~DeviceRecordsImpl() { reset(); } - public: void ensureSetup(deviceStream_t stream) { if (!pTracker_) { std::lock_guard lk(mtx_); if (!pTracker_) { - pTracker_.reset(new StreamTimeOffsetTracker(stream)); + pTracker_ = std::make_unique(stream); } } } - void addDeviceRecord(DeviceRecord record) { + void addDeviceRecord(const DeviceRecord& record) { std::lock_guard lk(mtx_); TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); records_.push_back(record); @@ -181,7 +187,7 @@ class DeviceRecordsImpl final { } void flushReady() { - while (records_.size() > 0) { + while (!records_.empty()) { auto& r = records_.front(); auto start_status = dipu::devproxy::getEventStatus(r.start->get()); auto end_status = dipu::devproxy::getEventStatus(r.stop->get()); @@ -191,30 +197,36 @@ class DeviceRecordsImpl final { origin_status != devapis::EventStatus::READY) { break; } - float t1 = 0.0f; - float t2 = 0.0f; + float t1 = 0.F; + float t2 = 0.F; + constexpr double kMillisecondPerSecond = 1e3; dipu::devproxy::eventElapsedTime(&t1, beginEvent(), r.start->get()); dipu::devproxy::eventElapsedTime(&t2, r.start->get(), r.stop->get()); - ready_records_.push_back( - Record({r.name, r.opId, static_cast(t1 * 1e3), - static_cast((t1 + t2) * 1e3), r.deviceId, r.streamId, - true, r.linkCorrelationId, r.extraInfo})); + ready_records_.push_back(Record( + {r.name, r.opId, static_cast(t1 * kMillisecondPerSecond), + static_cast((t1 + t2) * kMillisecondPerSecond), r.deviceId, + r.streamId, true, r.linkCorrelationId})); records_.pop_front(); } } void flush() { std::lock_guard lk(mtx_); - if (records_.size() > 0) { + if (!records_.empty()) { TORCH_CHECK(pTracker_, "dipu profiler error with pTracker is not inited"); auto& trakcer = *pTracker_; trakcer.sync(); float ratio = trakcer.ratio(); size_t offset = trakcer.offset(); + constexpr double kSecondPerMillisecond = 1e-3; for (auto& r : ready_records_) { - r.begin = static_cast(r.begin * 1e-3 * ratio) + offset; - r.end = static_cast(r.end * 1e-3 * ratio) + offset; + r.begin = static_cast(static_cast(r.begin) * + kSecondPerMillisecond * ratio) + + offset; + r.end = static_cast(static_cast(r.end) * + kSecondPerMillisecond * ratio) + + offset; RecordsImpl::get().addRecord(r); } ready_records_.clear(); @@ -223,7 +235,7 @@ class DeviceRecordsImpl final { RecordsImpl::get().addRecord( Record({r.name, r.opId, getTime(*r.start, ratio, offset), getTime(*r.stop, ratio, offset), r.deviceId, r.streamId, - true, r.linkCorrelationId, r.extraInfo})); + true, r.linkCorrelationId})); } records_.clear(); } @@ -244,6 +256,7 @@ class DeviceRecordsImpl final { } }; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) bool gEnableFlag = false; bool isEnable() { return gEnableFlag; } @@ -252,8 +265,9 @@ void setProfileOpen(bool profileFlag) { gEnableFlag = profileFlag; } void FlushAllRecords() { DeviceRecordsImpl::get().flush(); } -static size_t kInitModuleId = 10000; -std::atomic moduleId(kInitModuleId); +constexpr size_t kInitModuleId = 10000; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::atomic_size_t moduleId(kInitModuleId); size_t generateId() { return ++moduleId; } @@ -265,42 +279,36 @@ void abandonAllRecords() { resetId(); } -RecordCreator::RecordCreator(const string_t& name, size_t opId, - uint64_t linkCorrelationId, - const ExtraRecordInfo& extraInfo) { +RecordCreator::RecordCreator(string_t name, size_t opId, + uint64_t linkCorrelationId) { if (isEnable()) 
{ - name_ = name; + name_ = std::move(name); opId_ = opId; begin_ = torch::profiler::impl::getTime(); end_ = false; linkCorrelationId_ = linkCorrelationId; - extraInfo_ = extraInfo; } } -RecordCreator::~RecordCreator() { end(); } - -void RecordCreator::end() { +void RecordCreator::end() noexcept { if (!end_) { RecordsImpl::get().addRecord( Record{name_, opId_, begin_, static_cast(torch::profiler::impl::getTime()), static_cast(libkineto::processId()), static_cast(libkineto::systemThreadId()), false, - linkCorrelationId_, extraInfo_}); + linkCorrelationId_}); } end_ = true; } DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream, int streamId, size_t opId, - uint64_t linkCorrelationId, - const ExtraRecordInfo& extraInfo) { + uint64_t linkCorrelationId) { if (isEnable()) { DeviceRecordsImpl::get().ensureSetup(stream); - name_ = name; + name_ = std::move(name); opId_ = opId; - extraInfo_ = extraInfo; stream_ = stream; streamId_ = streamId; pStart_.reset(new DeviceEvent()); @@ -311,66 +319,31 @@ DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream, } } -DeviceRecordCreator::~DeviceRecordCreator() { end(); } - -void DeviceRecordCreator::end() { +void DeviceRecordCreator::end() noexcept { if (!end_) { TORCH_CHECK(pStart_, "dipu profiler error with pStart_ is not inited"); TORCH_CHECK(pStop_, "dipu profiler error with pStop_ is not inited"); dipu::devproxy::recordEvent(pStop_->get(), stream_); auto deviceId = dipu::devproxy::current_device(); - DeviceRecordsImpl::get().addDeviceRecord( - DeviceRecord{pStart_, pStop_, static_cast(deviceId), - static_cast(streamId_), name_, opId_, - linkCorrelationId_, extraInfo_}); + DeviceRecordsImpl::get().addDeviceRecord(DeviceRecord{ + pStart_, pStop_, static_cast(deviceId), + static_cast(streamId_), name_, opId_, linkCorrelationId_}); RecordsImpl::get().recordStream(deviceId, streamId_); } end_ = true; } -static std::string extraceFunction(const std::string& functionName) { - auto start = functionName.find_first_not_of(":"); - if (start == std::string::npos) { - return ""; - } - - auto end = functionName.find_first_of("("); - if (end == std::string::npos) { - end = functionName.size(); - } - - if (end <= start) { - return ""; - } - return functionName.substr(start, end - start); -} - -RecordBlockCreator::RecordBlockCreator(string_t name, - const ExtraRecordInfo& extraInfo, - deviceStream_t stream, int streamId, - bool enProfile) { - if (enProfile && isEnable()) { - size_t opId = generateId(); - uint64_t correlationId = - CorrelationIDManager::instance().getCorrelationID(); - name = extraceFunction(name); - pHostRecord_.reset(new RecordCreator("LaunchKernel_" + name, opId, - correlationId, extraInfo)); - pDeviceRecord_.reset(new DeviceRecordCreator(name, stream, streamId, opId, - correlationId, extraInfo)); - } -} +void RecordBlockCreator::initialize(string_t name, deviceStream_t stream, + c10::StreamId streamId) { + size_t opId = generateId(); + uint64_t correlationId = CorrelationIDManager::instance().getCorrelationID(); -void RecordBlockCreator::end() { - if (!finish_) { - pHostRecord_.reset(); - pDeviceRecord_.reset(); - } - finish_ = true; + pHostRecord_ = std::make_unique("LaunchKernel_" + name, opId, + correlationId); + pDeviceRecord_ = std::make_unique( + std::move(name), stream, streamId, opId, correlationId); } -RecordBlockCreator::~RecordBlockCreator() { end(); } - } // namespace profile } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h 
b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h index eed733567c..a52055bfac 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h @@ -1,23 +1,23 @@ #pragma once -#include -#include -#include +#include #include #include #include #include -#include #include -#include #include #include -#include +#include +#include +#include + +#include "csrc_dipu/vendor/vendorapi.h" #include #include -#include "CorrelationIDManager.h" +#include "IActivityProfiler.h" namespace dipu { namespace profile { @@ -38,29 +38,6 @@ void setProfileOpen(bool profileFlag); void FlushAllRecords(); void abandonAllRecords(); -struct ExtraRecordInfo { - string_t scope; - size_t opSeqId; - string_t attrs; - - ExtraRecordInfo() : scope(""), opSeqId(0), attrs("") {} - - ExtraRecordInfo& setScope(const string_t& scopeName) { - scope = scopeName; - return *this; - } - - ExtraRecordInfo& setSeqId(size_t seqId) { - opSeqId = seqId; - return *this; - } - - ExtraRecordInfo& setAttrs(const string_t& sAttrs) { - attrs = sAttrs; - return *this; - } -}; - struct Record { string_t name; size_t opId; @@ -71,7 +48,6 @@ struct Record { size_t threadIdx; bool isKernel = false; uint64_t linkCorrelationId = 0; - ExtraRecordInfo extraInfo; }; class RecordsImpl final { @@ -82,11 +58,12 @@ class RecordsImpl final { mutable mutex_t mtx_; // tid -> record list std::unordered_map> allRecordLists_; + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) thread_local static records_t* pRecords; std::map, libkineto::ResourceInfo> resourceInfo_; - private: RecordsImpl() = default; public: @@ -109,17 +86,15 @@ class RecordCreator final { size_t begin_; bool end_ = true; uint64_t linkCorrelationId_ = 0; - ExtraRecordInfo extraInfo_; public: - explicit RecordCreator(const string_t& name, size_t opId, - uint64_t linkCorrelationId, - const ExtraRecordInfo& extraInfo = ExtraRecordInfo()); + RecordCreator() = default; + RecordCreator(string_t name, size_t opId, uint64_t linkCorrelationId); - ~RecordCreator(); + ~RecordCreator() { end(); } private: - void end(); + void end() noexcept; }; class DeviceEvent; @@ -131,7 +106,6 @@ struct DeviceRecord { string_t name; size_t opId; uint64_t linkCorrelationId = 0; - ExtraRecordInfo extraInfo; }; class DeviceRecordCreator final { @@ -143,32 +117,52 @@ class DeviceRecordCreator final { std::shared_ptr pStart_, pStop_; bool end_ = true; uint64_t linkCorrelationId_ = 0; - ExtraRecordInfo extraInfo_; public: + DeviceRecordCreator() = default; DeviceRecordCreator(string_t name, deviceStream_t stream, int streamId, - size_t opId, uint64_t linkCorrelationId, - const ExtraRecordInfo& extraInfo = ExtraRecordInfo()); + size_t opId, uint64_t linkCorrelationId); - ~DeviceRecordCreator(); + ~DeviceRecordCreator() { end(); } private: - void end(); + void end() noexcept; }; class RecordBlockCreator { public: + RecordBlockCreator() = default; + // TODO(lljbash): maybe use std::string_view and std::optional after c++17 explicit RecordBlockCreator( - string_t name, const ExtraRecordInfo& extraInfo = ExtraRecordInfo(), - deviceStream_t stream = dipu::getCurrentDIPUStream(), - int streamId = dipu::getCurrentDIPUStream().id(), - bool enProfile = isEnable()); + c10::string_view name, + c10::optional stream = c10::nullopt, + c10::optional streamId = c10::nullopt, + c10::optional enProfile = c10::nullopt) { + if (enProfile.value_or(isEnable())) { + if (!stream) { + auto dipu_stream = getCurrentDIPUStream(); + if (!streamId) { + streamId = dipu_stream.id(); + } + stream = 
dipu_stream.rawstream(); + } + initialize(string_t(name), *stream, *streamId); + } + } - void end(); + void end() noexcept { + if (!finish_) { + pHostRecord_.reset(); + pDeviceRecord_.reset(); + finish_ = true; + } + } - ~RecordBlockCreator(); + ~RecordBlockCreator() { end(); } private: + void initialize(string_t name, deviceStream_t stream, c10::StreamId streamId); + std::unique_ptr pHostRecord_ = nullptr; std::unique_ptr pDeviceRecord_ = nullptr; bool finish_ = false; diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp index ab144e7fd4..db4e3c6b99 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUDeviceInfo.cpp @@ -15,25 +15,29 @@ using c10::DeviceIndex; using dipu::devapis::DIPUDeviceProperties; using std::shared_ptr; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) DeviceIndex num_gpus = -1; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) c10::once_flag init_flag; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::deque device_flags; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::vector> device_properties; -static void initDIPUContextVectors() { +void initDIPUContextVectors() { num_gpus = dipu::devproxy::getDeviceCount(); device_flags.resize(num_gpus); device_properties.resize(num_gpus); } -static void initDeviceProperty(DeviceIndex device_index) { +void initDeviceProperty(DeviceIndex device_index) { DIPUDeviceProperties device_prop = dipu::devproxy::getDeviceProperties(device_index); device_properties[device_index] = std::make_shared(device_prop); } -static inline void checkDevice(int32_t device_index) { +inline void checkDevice(int32_t device_index) { c10::call_once(init_flag, initDIPUContextVectors); if (device_index == -1) { device_index = dipu::devproxy::current_device(); diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h index 73846f0d4f..87063a8200 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h @@ -81,14 +81,14 @@ class DIPU_API DIPUEvent { device_index_, " does not match recording stream's device ", stream.device_index(), "."); DIPUGuard guard(device_index_); - devproxy::recordEvent(event_, stream); + devproxy::recordEvent(event_, stream.rawstream()); was_recorded_ = true; } void wait(const DIPUStream& stream) { if (isCreated()) { DIPUGuard guard(stream.device_index()); - devproxy::streamWaitEvent(stream, event_); + devproxy::streamWaitEvent(stream.rawstream(), event_); } } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp index eede27bf36..d3ad103fa4 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp @@ -65,7 +65,7 @@ EventPool* getEventPool() { const int index = devproxy::current_device(); // GlobalEventPool for different cards , construct when really needed #define dispatch_event_pool(device_id) \ - if (index == device_id) { \ + if (index == (device_id)) { \ static EventPool gDIPUEventPool( \ [](deviceEvent_t& event) { devapis::createEvent(&event); }, \ [](deviceEvent_t& event) { devapis::destroyEvent(event); }); \ diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp 
b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp index 40530510c9..9ee8d69dc5 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.cpp @@ -74,6 +74,7 @@ at::Generator createDIPUGenerator(at::DeviceIndex device_index) { DIPUGeneratorImpl::DIPUGeneratorImpl(at::DeviceIndex device_index) : c10::GeneratorImpl{at::Device(dipu::DIPU_DEVICE_TYPE, device_index), at::DispatchKeySet(dipu::DIPU_DISPATCH_KEY)}, + offset_(0), state_need_reset_(true) {} /** @@ -125,7 +126,9 @@ std::shared_ptr DIPUGeneratorImpl::clone() const { * See Note [Acquire lock when using random generators] */ DIPUGeneratorImpl* DIPUGeneratorImpl::clone_impl() const { - auto gen = new DIPUGeneratorImpl(this->device().index()); + auto gen = dynamic_cast( + createDIPUGenerator(this->device().index()).unsafeReleaseGeneratorImpl()); + TORCH_CHECK(gen != nullptr); gen->set_current_seed(this->seed_); auto state = this->state_; const auto& state_clone = state.clone(); diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h index aa8dee96b2..ae282f1e14 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h @@ -19,15 +19,16 @@ class DIPUGeneratorImpl : public c10::GeneratorImpl { uint64_t seed() override; static at::DeviceType device_type(); c10::intrusive_ptr get_state() const override; - void set_state(const c10::TensorImpl& state) override{}; - virtual void set_offset(uint64_t offset){}; - virtual uint64_t get_offset() const { return 0; }; + + virtual void set_offset(uint64_t offset) { offset_ = offset; } + virtual uint64_t get_offset() const { return offset_; } protected: void set_state_flag(bool flag); - virtual void update_state() const {} + virtual void update_state() const = 0; DIPUGeneratorImpl* clone_impl() const override; + volatile uint64_t offset_; uint64_t seed_ = c10::default_rng_seed_val; mutable at::Tensor state_; mutable bool state_need_reset_; diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp index 4057e0dd28..fd47614563 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.cpp @@ -5,7 +5,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -14,43 +15,32 @@ #include "DIPUGuard.h" -using dipu::devapis::deviceId_t; namespace dipu { - namespace { + enum class StreamIdType : uint8_t { - DEFAULT = 0x0, - POOL = 0x1, + DEFAULT = 0, + POOL = 1, }; -std::ostream& operator<<(std::ostream& stream, StreamIdType s) { +std::string to_string(StreamIdType s) { switch (s) { case StreamIdType::DEFAULT: - stream << "DEFAULT"; - break; + return "DEFAULT"; case StreamIdType::POOL: - stream << "POOL"; - break; + return "POOL"; default: - stream << static_cast(s); - break; + return std::to_string(static_cast(s)); } - return stream; } -// follow old pytorch cuda, seems new version use an opposite strategy. 
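// Illustrative sketch only, not part of the patch: the c10::StreamId packing
// implemented by makeC10StreamId / getStreamIdType / getStreamIdIndex defined
// just below, assuming kStreamsPerPoolBits == 3 (the low 3 bits hold the pool
// index, the bits above them hold the StreamIdType).
#include <cstdint>

constexpr int kBits = 3;

constexpr int64_t pack(uint8_t type, uint64_t index) {
  return static_cast<int64_t>((static_cast<uint64_t>(type) << kBits) | index);
}
constexpr uint8_t unpackType(int64_t id) {
  return static_cast<uint8_t>(static_cast<uint64_t>(id) >> kBits);
}
constexpr uint64_t unpackIndex(int64_t id) {
  return static_cast<uint64_t>(id) & ((1U << kBits) - 1);
}

static_assert(unpackType(pack(1, 5)) == 1, "stream type round-trips");
static_assert(unpackIndex(pack(1, 5)) == 5, "pool index round-trips");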
-static constexpr int kStreamsPerPoolBits = 3; -static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits; -// Global stream state and constants -static c10::DeviceIndex num_dipus = -1; -// Default streams -static std::once_flag global_init_flag; - -// streamid contains streamtype and/or raw stream id in DIPUStreamDevice pool -static thread_local std::unique_ptr current_streams = nullptr; +// follow old pytorch cuda, seems new version use an opposite strategy. +constexpr int kStreamsPerPoolBits = 3; +constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits; -static c10::StreamId makeC10StreamId(StreamIdType sType, size_t id) { - return ((uint32_t) static_cast(sType) << kStreamsPerPoolBits) | +c10::StreamId makeC10StreamId(StreamIdType sType, size_t id) { + return (static_cast(static_cast(sType) + << kStreamsPerPoolBits)) | static_cast(id); } @@ -60,31 +50,32 @@ struct DIPUStreamDevice { // Default streams std::once_flag pool_flag; std::once_flag default_flag; - deviceId_t devidx_; + devapis::deviceId_t devidx_; // seems pytorch 2.0 giveup default stream and enable cuda per_thread stream - // feature at compile time. it cannot be applied to othe device. - deviceStream_t default_stream = nullptr; - - std::atomic next_pool_pos; - std::array pool_streams; + // feature at compile time. it cannot be applied to other device. + deviceStream_t default_stream{}; + std::atomic next_pool_pos{}; + std::array pool_streams{}; inline uint32_t getNextPoolIdx() { auto raw_idx = next_pool_pos++; return raw_idx % kStreamsPerPool; } - inline StreamIdType getStreamIdType(c10::StreamId s) { - return static_cast((uint32_t)s >> kStreamsPerPoolBits); + static StreamIdType getStreamIdType(c10::StreamId s) { + return static_cast(static_cast(s) >> + kStreamsPerPoolBits); } - inline size_t getStreamIdIndex(c10::StreamId s) { - return static_cast((uint32_t)s & ((1 << kStreamsPerPoolBits) - 1)); + static size_t getStreamIdIndex(c10::StreamId s) { + return static_cast(static_cast(s) & + ((1 << kStreamsPerPoolBits) - 1)); } + void _doInitPool() { DIPUGuard device_guard{devidx_}; - for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) { - auto& raw_device_stream = pool_streams[i]; - devproxy::createStream(&raw_device_stream); + for (auto& stream : pool_streams) { + devproxy::createStream(&stream); } } @@ -96,17 +87,15 @@ struct DIPUStreamDevice { } public: - DIPUStreamDevice(deviceId_t devidx) { - devidx_ = devidx; - next_pool_pos = 0; - } + explicit DIPUStreamDevice(devapis::deviceId_t device_id) + : devidx_(device_id) {} DIPUStream getDIPUStreamfromPool() { const auto idx = getNextPoolIdx(); return DIPUStream(devidx_, makeC10StreamId(StreamIdType::POOL, idx)); } - DIPUStream getDefaultDIPUStream() { + DIPUStream getDefaultDIPUStream() const { return DIPUStream(devidx_, makeC10StreamId(StreamIdType::DEFAULT, 0)); } @@ -129,8 +118,8 @@ struct DIPUStreamDevice { case StreamIdType::POOL: return pool_streams[sidx]; default: - AT_ASSERTM(0, "Unrecognized stream ", stream_id, - " (I didn't recognize the stream type, ", st, ")"); + // TODO(assert): AT_ERROR is deprecated. + AT_ERROR("Invalid stream", stream_id, " (type=", to_string(st), ")"); } } void initPool() { @@ -141,73 +130,74 @@ struct DIPUStreamDevice { } }; -static std::array, C10_COMPILE_TIME_MAX_DIPUS> - streamDeviceList; - -static void initGlobalStreamState() { - num_dipus = devproxy::getDeviceCount(); - // Check if the number of DIPU matches the expected compile-time max number - // of DIPU. 
- AT_ASSERTM( - num_dipus <= C10_COMPILE_TIME_MAX_DIPUS, - "Number of DIPU devices on the machine is larger than the compiled " - "max number of dipus expected (", - C10_COMPILE_TIME_MAX_DIPUS, "). Increase that and recompile."); - - for (int i = 0; i < num_dipus; i++) { - streamDeviceList[i] = - std::move(std::unique_ptr(new DIPUStreamDevice(i))); - } +auto StreamDeviceList() + -> std::vector> const& { + auto make_list = [] { + auto number_of_device = devproxy::getDeviceCount(); + auto list = std::vector>(); + list.reserve(number_of_device); + for (auto i = 0; i < number_of_device; ++i) { + list.emplace_back(std::make_unique(i)); + } + return list; + }; + + auto static device_list = make_list(); + return device_list; } -static c10::DeviceIndex initDIPUGlobal(c10::DeviceIndex devIdx) { - // Inits default streams (once, globally) - std::call_once(global_init_flag, initGlobalStreamState); +auto LocalStreams() -> std::vector& { + auto static thread_local streams = std::vector( + StreamDeviceList().size(), makeC10StreamId(StreamIdType::DEFAULT, 0)); - // check device id - if (devIdx == -1) { - devIdx = devproxy::current_device(); - } - AT_ASSERT(devIdx >= 0 && devIdx < num_dipus); - streamDeviceList[devIdx]->initDevice(); - - // current_streams is thread local. so check every time. - if (current_streams) { - return devIdx; - } - current_streams = std::make_unique(num_dipus); + return streams; +} - // Inits current streams (thread local) to default streams - for (const auto i : c10::irange(num_dipus)) { - current_streams[i] = makeC10StreamId(StreamIdType::DEFAULT, 0); - } - // set device default stream in init - return devIdx; +// TODO(lifetime): remove it someday. +// +// This static variable is used to initialize StreamDevice and StreamIds. As +// BFCachingAllocator depends on them via getDefaultDIPUStream. We need to make +// sure its lifetime longer than static BFCachingAllocator. +auto const& force_to_initialize_streams = LocalStreams(); + +c10::DeviceIndex setupDevice(c10::DeviceIndex device_index) { + if (device_index == -1) { + device_index = devproxy::current_device(); + } + + auto& device_list = StreamDeviceList(); + auto number_of_device = static_cast(device_list.size()); + // TODO(assert): AT_ASSERT is deprecated and TORCH_CHECK contains their own + // help message. We need our version. 
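// Illustrative sketch only, not part of the patch: one shape the TODO above
// could take, using TORCH_CHECK (which already attaches file/line information
// and prints the extra message arguments); check_device_index and its
// parameters are hypothetical.
#include <c10/util/Exception.h>

inline void check_device_index(int device_index, int number_of_device) {
  TORCH_CHECK(0 <= device_index && device_index < number_of_device,
              "invalid DIPU device index ", device_index, ", only ",
              number_of_device, " device(s) available");
}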
+ AT_ASSERT(0 <= device_index && device_index < number_of_device); + device_list[device_index]->initDevice(); + return device_index; } } // end anonymous namespace // api deviceStream_t DIPUStream::rawstream() const { - return streamDeviceList[this->device_index()]->obtainRawStream( - this->unwrap().id()); + return StreamDeviceList()[stream_.device_index()]->obtainRawStream( + stream_.id()); } -DIPUStream getDIPUStreamFromPool(c10::DeviceIndex devIdx) { - devIdx = initDIPUGlobal(devIdx); +DIPUStream getDIPUStreamFromPool(c10::DeviceIndex device_index) { + device_index = setupDevice(device_index); // Initializes the stream pools (once) - streamDeviceList[devIdx]->initPool(); - return streamDeviceList[devIdx]->getDIPUStreamfromPool(); + auto& device = *StreamDeviceList()[device_index]; + device.initPool(); + return device.getDIPUStreamfromPool(); } -DIPUStream getDefaultDIPUStream(c10::DeviceIndex devIdx) { - devIdx = initDIPUGlobal(devIdx); - return streamDeviceList[devIdx]->getDefaultDIPUStream(); +DIPUStream getDefaultDIPUStream(c10::DeviceIndex device_index) { + device_index = setupDevice(device_index); + return StreamDeviceList()[device_index]->getDefaultDIPUStream(); } -DIPUStream getCurrentDIPUStream(c10::DeviceIndex devIdx) { - devIdx = initDIPUGlobal(devIdx); - return DIPUStream(devIdx, current_streams[devIdx]); +DIPUStream getCurrentDIPUStream(c10::DeviceIndex device_index) { + device_index = setupDevice(device_index); + return DIPUStream(device_index, LocalStreams()[device_index]); } // copy from pytorch, not verify @@ -218,13 +208,10 @@ DIPUStream getStreamFromExternal(deviceStream_t ext_stream, } void setCurrentDIPUStream(DIPUStream stream) { - auto devIdx = stream.device_index(); - initDIPUGlobal(devIdx); - current_streams[devIdx] = stream.unwrap().id(); -} - -std::ostream& operator<<(std::ostream& os, const DIPUStream& stream) { - return os << stream.unwrap(); + auto device_index = stream.device_index(); + // TODO(assert): assert(setupDevice(device_index) == device_index) + setupDevice(device_index); + LocalStreams()[device_index] = stream.unwrap().id(); } } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h index 33cadadee2..63f75d4d3b 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUStream.h @@ -1,15 +1,10 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include -#include -#include - #include #include #include #include -#include #include #include @@ -17,33 +12,30 @@ namespace dipu { class DIPU_API DIPUStream { + private: + c10::Stream stream_; + public: + // Need more discussion to handle empty DIPUStream. + explicit DIPUStream() : DIPUStream(-1, 0) {} + explicit DIPUStream(c10::Stream stream) : stream_(stream) { TORCH_CHECK(stream_.device_type() == dipu::DIPU_DEVICE_TYPE); } - explicit DIPUStream(devapis::deviceId_t devidx, c10::StreamId stream_id) + explicit DIPUStream(devapis::deviceId_t device_id, c10::StreamId stream_id) : DIPUStream(c10::Stream(c10::Stream::UNSAFE, - c10::Device(dipu::DIPU_DEVICE_TYPE, devidx), + c10::Device(dipu::DIPU_DEVICE_TYPE, device_id), stream_id)) {} - // Need more discussion to handle empty DIPUStream. 
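For orientation, a short usage sketch of the stream API exercised above (getDefaultDIPUStream, getDIPUStreamFromPool, setCurrentDIPUStream). The include path is assumed from the other csrc_dipu includes in this diff, and the snippet only illustrates the intended call pattern, not a verified build:

```cpp
#include "csrc_dipu/runtime/core/DIPUStream.h"  // path assumed from this diff

namespace {

void use_side_stream() {
  // The default stream for the current device (device_index == -1).
  dipu::DIPUStream default_stream = dipu::getDefaultDIPUStream();

  // Grab a stream from the per-device pool and make it current for this thread.
  dipu::DIPUStream side_stream = dipu::getDIPUStreamFromPool();
  dipu::setCurrentDIPUStream(side_stream);

  // Work launched through getCurrentDIPUStream() now lands on side_stream.
  // ... enqueue kernels / copies here ...

  // Restore the default stream so later code on this thread is unaffected.
  dipu::setCurrentDIPUStream(default_stream);
}

}  // namespace
```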
- explicit DIPUStream() : DIPUStream(-1, 0) {} - - ~DIPUStream() = default; - bool operator==(const DIPUStream& other) const noexcept { return unwrap() == other.unwrap(); } bool operator!=(const DIPUStream& other) const noexcept { - return unwrap() != other.unwrap(); + return not operator==(other); } - // FIXME: add explicit later as it is used by many other files. - operator c10::Stream() const { return unwrap(); } - operator deviceStream_t() const { return rawstream(); } - /// Get the device index that this stream is associated with. c10::DeviceIndex device_index() const { return stream_.device_index(); } @@ -65,26 +57,12 @@ class DIPU_API DIPUStream { return devproxy::isStreamEmpty(rawstream()); } - /// Explicit conversion to rtStream_t. - deviceStream_t rawstream() const; - - /// Explicit conversion to Stream. c10::Stream unwrap() const { return stream_; } - c10::StreamData3 pack3() const noexcept { return stream_.pack3(); } - - static DIPUStream unpack3(c10::StreamId stream_id, - c10::DeviceIndex device_index, - c10::DeviceType device_type) { - TORCH_CHECK(device_type == dipu::DIPU_DEVICE_TYPE); - return DIPUStream(device_index, stream_id); - } - - private: - c10::Stream stream_; + deviceStream_t rawstream() const; }; -DIPU_API DIPUStream getDIPUStreamFromPool(c10::DeviceIndex device = -1); +DIPU_API DIPUStream getDIPUStreamFromPool(c10::DeviceIndex device_index = -1); DIPU_API DIPUStream getDefaultDIPUStream(c10::DeviceIndex device_index = -1); @@ -95,7 +73,10 @@ DIPU_API void setCurrentDIPUStream(DIPUStream stream); DIPU_API DIPUStream getStreamFromExternal(deviceStream_t ext_stream, c10::DeviceIndex device_index); -std::ostream& operator<<(std::ostream& stream, const DIPUStream& s); +template +O& operator<<(O& oss, const dipu::DIPUStream& stream) { + oss << stream.unwrap(); +} } // namespace dipu template <> diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp index 82c7d61b47..74b9242a95 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.cpp @@ -73,11 +73,15 @@ int32_t MemChecker::log_interval() { std::string MemChecker::current_state() const { std::stringstream stream; - stream << "current block num = " << blocks_.size() - << ", total_size = " << (total_size_ >> 20) << "MB" - << ", insert count = " << insert_cnt_ - << ", max block num = " << max_block_num() - << ", log interval = " << log_interval(); + stream + << "current block num = " + << blocks_.size() + // convert B to MB + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + << ", total_size = " << (total_size_ >> 20) << "MB" + << ", insert count = " << insert_cnt_ + << ", max block num = " << max_block_num() + << ", log interval = " << log_interval(); return stream.str(); } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h index 9f3f7b0e44..b047ff3979 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/MemChecker.h @@ -1,8 +1,8 @@ // Copyright (c) 2023, DeepLink. 
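As transcribed above, the templated operator<< that replaces the std::ostream overload falls off the end without returning oss, which is undefined behaviour for a function with a non-void return type; its template parameter list also looks lost to extraction. A small corrected sketch with a stand-in stream type (the real class wraps c10::Stream):

```cpp
#include <iostream>
#include <sstream>

namespace dipu_sketch {

struct DIPUStream {                  // stand-in for dipu::DIPUStream
  int id = 0;
  int unwrap() const { return id; }  // the real unwrap() returns a c10::Stream
};

// Any ostream-like type works; the important part is returning the stream so
// that chained insertions ("<< a << b") keep compiling and behave correctly.
template <typename O>
O& operator<<(O& os, const DIPUStream& stream) {
  os << "stream(" << stream.unwrap() << ")";
  return os;
}

}  // namespace dipu_sketch

int main() {
  dipu_sketch::DIPUStream s{3};
  std::cout << s << '\n';            // prints: stream(3)
  std::ostringstream oss;
  oss << s << " done";               // chaining relies on the returned reference
  return 0;
}
```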
#pragma once +#include #include -#include #include #include #include @@ -28,7 +28,6 @@ class MemChecker final { private: std::string current_state() const; - private: std::mutex mtx_; std::unordered_map> blocks_; int64_t total_size_ = 0; diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp index 388d05856c..cb25fb4bc2 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp @@ -1,10 +1,10 @@ // Copyright (c) 2023, DeepLink. #include -#include -#include +#include #include #include +#include #include #include "DIPUCachingAllocator.h" @@ -27,9 +27,9 @@ class BFCachingAllocatorImpl { static constexpr int kLogNumSubBins = 2; // Allocation parameters static constexpr size_t kMinAllocationSize = 512; - static constexpr size_t kMaxInternalFragmentation = 8u << 20u; // 8MB - static constexpr size_t kMinExtendSize = 8u << 20u; // 8MB - static constexpr size_t kMaxExtendSize = 1u << 30u; // 1GB + static constexpr size_t kMaxInternalFragmentation = 8U << 20U; // 8MB + static constexpr size_t kMinExtendSize = 8U << 20U; // 8MB + static constexpr size_t kMaxExtendSize = 1U << 30U; // 1GB size_t cachedBytes = 0; size_t allocatedBytes = 0; @@ -61,7 +61,7 @@ class BFCachingAllocatorImpl { // into 128 bits (`kNumBigBins` * `kNumSubBins`) __uint128_t bits = 0; // Virtual chunks which are the heads of the bins - int binHeads_[kNumBigBins * kNumSubBins]{0}; + std::array(kNumBigBins* kNumSubBins)> binHeads_{}; // The extending size next time size_t currExtendSize_ = kMinExtendSize; @@ -78,12 +78,14 @@ class BFCachingAllocatorImpl { // Find the index of the first "1" // `__builtin_ctzll` only support `uint64_t`, // so we have to divide - uint64_t low_bits = map, high_bits = map >> 64u; + uint64_t low_bits = map; + constexpr int kLowBitWidth = 64; + uint64_t high_bits = map >> kLowBitWidth; if (low_bits) { return __builtin_ctzll(low_bits); } if (high_bits) { - return 64 + __builtin_ctzll(high_bits); + return kLowBitWidth + __builtin_ctzll(high_bits); } return -1; } @@ -117,14 +119,16 @@ class BFCachingAllocatorImpl { Chunk(void* ptr, size_t size, size_t stream) : ptr(ptr), size(size), stream(stream) {} - bool isMonoBlock() const { return !prevChunkInMem && !nextChunkInMem; } + bool isMonoBlock() const { + return (prevChunkInMem == 0) && (nextChunkInMem == 0); + } }; std::vector chunks_; // Use id recycling for better performance std::stack recycleIds_; - typedef std::unique_ptr StreamSetHandle; + using StreamSetHandle = std::unique_ptr; std::vector streamSets_; using mutex_t = SpinMutex; @@ -135,14 +139,14 @@ class BFCachingAllocatorImpl { } int newChunk(void* ptr, size_t size, size_t stream) { - int id; + int id = 0; if (!recycleIds_.empty()) { id = recycleIds_.top(); recycleIds_.pop(); chunks_[id] = Chunk(ptr, size, stream); } else { - id = chunks_.size(); - chunks_.emplace_back(Chunk(ptr, size, stream)); + id = static_cast(chunks_.size()); + chunks_.emplace_back(ptr, size, stream); } if (!ptr) { chunks_[id].allocated = true; @@ -155,11 +159,14 @@ class BFCachingAllocatorImpl { // [2^`bigBinIdx`, 2^(`bigBinIdx`+1)), length: 2^`bigBinIdx` // Split big bin into `kNumSubBins` sub bins size_t nBlocks = nbytes / kMinAllocationSize; - int bigBinIdx = 63 - __builtin_clzll(nBlocks); + constexpr int kMaxBinIdx = 63; + int bigBinIdx = kMaxBinIdx - __builtin_clzll(nBlocks); // If `nbytes` is so 
large, we just put it into the last - if (bigBinIdx > kNumBigBins - 1) return kNumBigBins * kNumSubBins - 1; + if (bigBinIdx > kNumBigBins - 1) { + return kNumBigBins * kNumSubBins - 1; + } // Get the index of sub bin - int subBinIdx = nBlocks ^ (1ull << bigBinIdx); + int subBinIdx = static_cast(nBlocks ^ (1ULL << bigBinIdx)); subBinIdx >>= std::max(bigBinIdx - kLogNumSubBins, 0); return bigBinIdx * kNumSubBins + subBinIdx; } @@ -385,11 +392,11 @@ class BFCachingAllocatorImpl { void set_mem_allocate_fn(allocate_fn_t allocate_fn, deallocate_fn_t deallocate_fn) { DIPU_DEBUG_ALLOCATOR(4, "BFCachingAllocator: set_mem_allocate_fn "); - this->allocate_fn = allocate_fn; - this->deallocate_fn = deallocate_fn; + this->allocate_fn = std::move(allocate_fn); + this->deallocate_fn = std::move(deallocate_fn); } - size_t memory_reserved() { return cachedBytes; } + size_t memory_reserved() const { return cachedBytes; } }; static void deleteBFContext(void* ptr); @@ -405,7 +412,7 @@ class BFCachingAllocator : public CacheAllocator { while (async_mem_pool()->ready()) { const auto block = async_mem_pool()->get(); void* ptr = std::get<0>(block); - int id = std::get<1>(block); + int id = static_cast(std::get<1>(block)); DIPU_DEBUG_ALLOCATOR( 8, "BFCachingAllocator: " << __FUNCTION__ << " ,ptr:" << ptr << " ,id:" << id << " ,allocator:" << this @@ -424,7 +431,7 @@ class BFCachingAllocator : public CacheAllocator { } const auto block = async_mem_pool()->get(); void* ptr = std::get<0>(block); - int id = std::get<1>(block); + int id = static_cast(std::get<1>(block)); DIPU_DEBUG_ALLOCATOR( 8, "BFCachingAllocator: " << __FUNCTION__ << " ,ptr:" << ptr << " ,id:" << id << " ,allocator:" << this @@ -437,14 +444,16 @@ class BFCachingAllocator : public CacheAllocator { if (impl) { return; } - impl.reset(new BFCachingAllocatorImpl()); + impl = std::make_unique(); std::function alloc_fn = - std::bind(&BFCachingAllocator::allocate_raw, (BFCachingAllocator*)this, - std::placeholders::_1); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + std::bind(&BFCachingAllocator::allocate_raw, + const_cast(this), std::placeholders::_1); std::function dealloc_fn = - std::bind(&BFCachingAllocator::free_raw, (BFCachingAllocator*)this, - std::placeholders::_1); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + std::bind(&BFCachingAllocator::free_raw, + const_cast(this), std::placeholders::_1); impl->set_mem_allocate_fn(alloc_fn, dealloc_fn); } @@ -470,11 +479,11 @@ class BFCachingAllocator : public CacheAllocator { if (allocator_->impl) { if (ptr()) { std::deque events; - for (auto iter = streams().begin(); iter != streams().end(); iter++) { + for (auto const& stream : streams()) { events.emplace_back(); DIPU_DEBUG_ALLOCATOR(8, "BFCachingAllocator: record to stream:" - << iter->rawstream()); - events.back().record(*iter); + << stream.rawstream()); + events.back().record(stream); } allocator_->async_mem_pool()->add(std::make_tuple(ptr(), id_), events); @@ -546,7 +555,7 @@ class BFCachingAllocator : public CacheAllocator { BFCachingAllocator() { check_impl(); } - ~BFCachingAllocator() { + ~BFCachingAllocator() override { DIPU_DEBUG_ALLOCATOR(8, "~BFCachingAllocator allocator:" << this); release_all_memory(); } @@ -561,7 +570,7 @@ static void deleteBFContext(void* ptr) { delete ctx; } -DIPU_REGISTER_ALLOCATOR(BF, dipu::DIPU_DEVICE_TYPE, BFCachingAllocator, 0); -DIPU_REGISTER_ALLOCATOR(BF, at::DeviceType::CPU, BFCachingAllocator, 0); +DIPU_REGISTER_ALLOCATOR(BF, DIPU_DEVICE_TYPE_MACRO, BFCachingAllocator, 0); 
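The allocator above tracks non-empty bins in a __uint128_t bitmap and locates the first one with __builtin_ctzll applied to each 64-bit half, exactly as in the hunk. A standalone sketch of that helper; it assumes a GCC/Clang toolchain for __uint128_t and the builtin:

```cpp
#include <cassert>
#include <cstdint>

// Returns the index of the lowest set bit in a 128-bit mask, or -1 if empty.
// __builtin_ctzll only accepts 64-bit operands, so the mask is split in two.
int lowestSetBit(__uint128_t mask) {
  constexpr int kLowBitWidth = 64;
  const uint64_t low_bits = static_cast<uint64_t>(mask);
  const uint64_t high_bits = static_cast<uint64_t>(mask >> kLowBitWidth);
  if (low_bits != 0) {
    return __builtin_ctzll(low_bits);
  }
  if (high_bits != 0) {
    return kLowBitWidth + __builtin_ctzll(high_bits);
  }
  return -1;
}

int main() {
  assert(lowestSetBit(0) == -1);
  assert(lowestSetBit(1) == 0);
  assert(lowestSetBit(__uint128_t{1} << 70) == 70);
  assert(lowestSetBit((__uint128_t{1} << 70) | 8) == 3);  // lowest bit wins
  return 0;
}
```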
+DIPU_REGISTER_ALLOCATOR(BF, CPU, BFCachingAllocator, 0); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp index 04dcead1f7..df1dd7a800 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBSCachingAllocator.cpp @@ -1,18 +1,18 @@ // Copyright (c) 2023, DeepLink. +#include #include #include #include #include #include -#include #include #include "DIPUCachingAllocator.h" namespace dipu { -static void deleteBSContext(void*); +static void deleteBSContext(void* ptr); class BSCachingAllocator : public CacheAllocator { struct Impl { @@ -26,32 +26,35 @@ class BSCachingAllocator : public CacheAllocator { mutable mutex_t mutex_; public: - BSCachingAllocator() { impl.reset(new Impl()); } + BSCachingAllocator() { impl = std::make_unique(); } - ~BSCachingAllocator() { release_all_memory(); } + ~BSCachingAllocator() override { release_all_memory(); } // Better adaptability to memory blocks of various sizes, but internal // fragmentation will be larger - size_t getAllocateSizeMoreAdaptable(size_t nbytes) const { - static const int kMinAllocationSizeExp = []() { - size_t size = 511; + static size_t getAllocateSizeMoreAdaptable(size_t nbytes) { + constexpr int kMaxBits = 32; + static const int kMinAllocationSizeExp = [kMaxBits]() { + constexpr int kBytesNum = 511; + size_t size = kBytesNum; const char* env = std::getenv("DIPU_BS_ALLOCATOR_MIN_ALLOCATE_SIZE"); if (env != nullptr) { size = std::atoi(env); } - int exp = 32 - __builtin_clz(size); + int exp = kMaxBits - __builtin_clz(size); return exp; }(); - auto r = std::max(32 - __builtin_clz(nbytes), kMinAllocationSizeExp); + auto r = std::max(kMaxBits - __builtin_clz(nbytes), kMinAllocationSizeExp); size_t allocateSize = 1 << r; return allocateSize; } // The internal fragments are smaller, but are less adaptable to scenes with // frequent and drastic changes in size. - size_t getAllocateSizeLessFragmentation(size_t nbytes) const { + static size_t getAllocateSizeLessFragmentation(size_t nbytes) { static const size_t kMinAllocationSize = []() { - size_t size = 512; + const int kBytesNum = 512; + size_t size = kBytesNum; const char* env = std::getenv("DIPU_BS_ALLOCATOR_MIN_ALLOCATE_SIZE"); if (env != nullptr) { size = std::atoi(env); @@ -62,7 +65,7 @@ class BSCachingAllocator : public CacheAllocator { return allocateSize; } - size_t getAllocateSize(size_t nbytes) const { + static size_t getAllocateSize(size_t nbytes) { static bool less_fragmentation = std::getenv("DIPU_BS_MORE_ADAPTABLE") == nullptr; return less_fragmentation ? 
getAllocateSizeLessFragmentation(nbytes) @@ -79,11 +82,11 @@ class BSCachingAllocator : public CacheAllocator { size_t nbytes = getAllocateSize(size); void* ptr = nullptr; auto& idel_blocks = impl->idel_blocks_[nbytes]; - if (idel_blocks.size() <= 0) { + if (idel_blocks.empty()) { empty_resource_pool(); } for (size_t i = 0; i < 2; i++) { - if (idel_blocks.size() > 0) { + if (!idel_blocks.empty()) { ptr = idel_blocks.front(); idel_blocks.pop_front(); impl->total_idel_bytes_ -= nbytes; @@ -92,28 +95,24 @@ class BSCachingAllocator : public CacheAllocator { << " bytes, ptr:" << ptr << ",allocator:" << this); break; - } else { - try { - auto data_ptr = raw_allocator()->allocate(nbytes); - ptr = data_ptr.get(); - device() = data_ptr.device(); - data_ptr.release_context(); - set_memory_reserved(memory_reserved() + nbytes); - - impl->allocated_.insert(ptr); - impl->total_alocated_bytes_ += nbytes; - DIPU_DEBUG_ALLOCATOR(4, "BSCachingAllocator::allocate " - << nbytes << ", requires:" << size - << " bytes, ptr:" << ptr - << ",allocator:" << this); - break; - } catch (...) { - if (i == 0) { - empty_cache(); - } else { - TORCH_CHECK(false, "no memory available") - } - } + } + try { + auto data_ptr = raw_allocator()->allocate(nbytes); + ptr = data_ptr.get(); + device() = data_ptr.device(); + data_ptr.release_context(); + set_memory_reserved(memory_reserved() + nbytes); + + impl->allocated_.insert(ptr); + impl->total_alocated_bytes_ += nbytes; + DIPU_DEBUG_ALLOCATOR(4, "BSCachingAllocator::allocate " + << nbytes << ", requires:" << size + << " bytes, ptr:" << ptr + << ",allocator:" << this); + break; + } catch (...) { + TORCH_CHECK(i == 0, "no memory available"); + empty_cache(); } } set_memory_allocated(memory_allocated() + nbytes); @@ -148,7 +147,7 @@ class BSCachingAllocator : public CacheAllocator { } } - void empty_cache() const override { + void empty_cache_impl() const { DIPU_DEBUG_ALLOCATOR(8, "BSCachingAllocator::empty_cache ,allocator:" << this); empty_resource_pool(); @@ -168,12 +167,16 @@ class BSCachingAllocator : public CacheAllocator { } } - void release_all_memory() const { + void empty_cache() const override { empty_cache_impl(); } + + void release_all_memory_impl() const { DIPU_DEBUG_ALLOCATOR( 8, "BSCachingAllocator::release_all_memory allocator:" << this); empty_cache(); } + void release_all_memory() const override { release_all_memory_impl(); } + void flush_mem_pool() const { DIPU_DEBUG_ALLOCATOR( 8, "BSCachingAllocator::flush_mem_pool allocator:" << this); @@ -195,9 +198,9 @@ class BSCachingAllocator : public CacheAllocator { << ", size_:" << size()); if (allocator_->impl) { std::deque events; - for (auto iter = streams().begin(); iter != streams().end(); iter++) { + for (const auto& item : streams()) { events.emplace_back(); - events.back().record(*iter); + events.back().record(item); } allocator_->async_mem_pool()->add(std::make_tuple(ptr(), size()), @@ -225,7 +228,9 @@ static void deleteBSContext(void* ptr) { delete ctx; } -DIPU_REGISTER_ALLOCATOR(BS, dipu::DIPU_DEVICE_TYPE, BSCachingAllocator, 0); -DIPU_REGISTER_ALLOCATOR(BS, at::DeviceType::CPU, BSCachingAllocator, 0); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DIPU_REGISTER_ALLOCATOR(BS, DIPU_DEVICE_TYPE_MACRO, BSCachingAllocator, 0); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DIPU_REGISTER_ALLOCATOR(BS, CPU, BSCachingAllocator, 0); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp 
b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp index b3396b2f26..37071972e3 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.cpp @@ -5,9 +5,17 @@ #include #include #include +#include + +#include +#include + +#include "csrc_dipu/base/basedef.h" +#include "csrc_dipu/runtime/devproxy/deviceproxy.h" namespace dipu { +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::mutex DIPURawDeviceAllocator::mutex_; namespace { @@ -22,41 +30,44 @@ using RegisteredAllocator = std::map< std::map, uint8_t>>>; -static std::unique_ptr gDIPURegisterdAllocatorPtr; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::unique_ptr gDIPURegisteredAllocatorPtr; -static std::mutex dipu_register_allocator_mutex; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::mutex dipu_register_allocator_mutex; -static std::set used_allocator; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +std::set used_allocator; } // namespace constexpr const char* dipu_default_memcaching_algorithm = "BF"; -std::string dipu_device_memcaching_algorithm = []() { +const std::string dipu_device_memcaching_algorithm = []() { const char* env = std::getenv("DIPU_DEVICE_MEMCACHING_ALGORITHM"); return env ? env : dipu_default_memcaching_algorithm; }(); -std::string dipu_host_memcaching_algorithm = []() { +const std::string dipu_host_memcaching_algorithm = []() { const char* env = std::getenv("DIPU_HOST_MEMCACHING_ALGORITHM"); return env ? env : dipu_default_memcaching_algorithm; }(); -void setAllocator(const std::string name, c10::DeviceType device_type, - std::function allocator_geter, +void setAllocator(const std::string& name, c10::DeviceType device_type, + const std::function& allocator_getter, uint8_t priority) { std::lock_guard lock(dipu_register_allocator_mutex); - if (!gDIPURegisterdAllocatorPtr) { - gDIPURegisterdAllocatorPtr = std::make_unique(); + if (!gDIPURegisteredAllocatorPtr) { + gDIPURegisteredAllocatorPtr = std::make_unique(); } - auto& gDIPURegisterdAllocator = *gDIPURegisterdAllocatorPtr; - if (gDIPURegisterdAllocator[device_type].count(name) <= 0) { - gDIPURegisterdAllocator[device_type][name] = - std::make_tuple(allocator_geter, priority); + auto& gDIPURegisteredAllocator = *gDIPURegisteredAllocatorPtr; + if (gDIPURegisteredAllocator[device_type].count(name) <= 0) { + gDIPURegisteredAllocator[device_type][name] = + std::make_tuple(allocator_getter, priority); } else { - if (std::get<1>(gDIPURegisterdAllocator[device_type][name]) < priority) { - gDIPURegisterdAllocator[device_type][name] = - std::make_tuple(allocator_geter, priority); + if (std::get<1>(gDIPURegisteredAllocator[device_type][name]) < priority) { + gDIPURegisteredAllocator[device_type][name] = + std::make_tuple(allocator_getter, priority); } else { TORCH_CHECK(false, "A higher priority allocator is already registered for the " @@ -66,21 +77,29 @@ void setAllocator(const std::string name, c10::DeviceType device_type, } } -c10::Allocator* getAllocator(const c10::Device& device) { +namespace { + +int getDeviceIndex(const c10::Device& device, int host_index) { + if (device.is_cpu()) { + return host_index; + } + if (device.has_index()) { + return device.index(); + } + return devproxy::current_device(); +} + +c10::Allocator* createAllocator(const c10::Device& device) { c10::DeviceType device_type = device.type(); c10::Allocator* result = 
nullptr; - auto& gDIPURegisterdAllocator = *gDIPURegisterdAllocatorPtr; + auto& gDIPURegisteredAllocator = *gDIPURegisteredAllocatorPtr; const std::string algorithm = (device_type == dipu::DIPU_DEVICE_TYPE ? dipu_device_memcaching_algorithm : dipu_host_memcaching_algorithm); - if (gDIPURegisterdAllocator[device_type].count(algorithm) > 0) { + if (gDIPURegisteredAllocator[device_type].count(algorithm) > 0) { auto allocator_geter = - std::get<0>(gDIPURegisterdAllocator[device_type][algorithm]); - int device_index = 0; - if (device_type == dipu::DIPU_DEVICE_TYPE) { - device_index = - device.has_index() ? device.index() : devproxy::current_device(); - } + std::get<0>(gDIPURegisteredAllocator[device_type][algorithm]); + int device_index = getDeviceIndex(device, 0); auto allocator = allocator_geter(device_index); if (device_type == dipu::DIPU_DEVICE_TYPE) { @@ -94,14 +113,31 @@ c10::Allocator* getAllocator(const c10::Device& device) { return nullptr; } +} // namespace + +c10::Allocator* getAllocator(const c10::Device& device) { + // allocator_lookup_table[device_index] == device allocator + // allocator_lookup_table[device_count] == host allocator + static const int device_count = devproxy::getDeviceCount(); + static const int host_index = device_count; + static std::vector allocator_lookup_table(device_count + 1); + int device_index = getDeviceIndex(device, host_index); + auto& allocator = allocator_lookup_table[device_index]; + if (allocator == nullptr) { + allocator = createAllocator(device); + } + return allocator; +} + c10::Allocator* getAllocator(c10::DeviceType device_type) { return getAllocator(c10::Device(device_type)); } void emptyCachedMem() { - auto empty_allocator_cache = [](auto allocator) { + auto function_name = __FUNCTION__; + auto empty_allocator_cache = [&function_name](auto allocator) { auto cached_allocator = dynamic_cast(allocator); - DIPU_DEBUG_ALLOCATOR(8, __FUNCTION__ + DIPU_DEBUG_ALLOCATOR(8, function_name << " allocator:" << allocator << ", cached_allocator:" << cached_allocator); if (cached_allocator != nullptr) { @@ -164,18 +200,14 @@ size_t maxMemoryAllocated(const c10::Device& device) { return 0; } -void recordStream(const c10::DataPtr& ptr, DIPUStream stream) { - void* ctx = ptr.get_context(); - if (ctx == nullptr) { - return; - } - auto base_cxt = static_cast(ctx); - if (base_cxt) { - base_cxt->streams().insert(stream); +void recordStream(const c10::DataPtr& ptr, const DIPUStream& stream) { + using pointer = CacheAllocator::DataPtrContextBase*; + if (auto ctx = static_cast(ptr.get_context())) { + ctx->streams().insert(stream); } } -void recordStream(const at::Tensor& tensor, DIPUStream stream) { +void recordStream(const at::Tensor& tensor, const DIPUStream& stream) { dipu::recordStream(tensor.storage().data_ptr(), stream); } @@ -184,12 +216,12 @@ class DIPUDeviceCachingProxy : public c10::Allocator { c10::DeviceType device_type_; public: - DIPUDeviceCachingProxy(c10::DeviceType device_type) + explicit DIPUDeviceCachingProxy(c10::DeviceType device_type) : device_type_(device_type) {} - ~DIPUDeviceCachingProxy() {} + ~DIPUDeviceCachingProxy() override = default; - c10::DataPtr allocate(size_t size) const { + c10::DataPtr allocate(size_t size) const override { return getAllocator(device_type_)->allocate(size); } @@ -197,15 +229,17 @@ class DIPUDeviceCachingProxy : public c10::Allocator { return getAllocator(device_type_)->raw_deleter(); } }; -static DIPUDeviceCachingProxy dipu_default_device_allocator( - dipu::DIPU_DEVICE_TYPE); +// 
NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DIPUDeviceCachingProxy dipu_default_device_allocator(dipu::DIPU_DEVICE_TYPE); }; // namespace void initCachedAllocator() { // Make the c10::GetAllocator interface available + constexpr int kPriority = 255; c10::SetAllocator(dipu::DIPU_DEVICE_TYPE, &dipu_default_device_allocator, - 255); - c10::SetAllocator(c10::DeviceType::CUDA, &dipu_default_device_allocator, 255); + kPriority); + c10::SetAllocator(c10::DeviceType::CUDA, &dipu_default_device_allocator, + kPriority); } } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h index 98245a430d..134f499f34 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h @@ -1,17 +1,11 @@ // Copyright (c) 2023, DeepLink. #pragma once -#include -#include -#include - #include #include - -#include "csrc_dipu/runtime/core/DIPUEvent.h" +#include #include "DIPUAsyncResourcePool.h" -#include "DIPUCachingAllocatorUtils.h" #include "DIPURawAllocator.h" namespace dipu { @@ -41,7 +35,7 @@ class MemStats { } public: - MemStats() {} + MemStats() = default; ~MemStats() { if (allocated_in_bytes_ != 0) { @@ -58,9 +52,9 @@ class MemStats { size_t memory_reserved() const { return reserved_in_bytes_; } - size_t max_memory_allocated() { return max_allocated_in_bytes_; } + size_t max_memory_allocated() const { return max_allocated_in_bytes_; } - size_t max_memory_reserved() { return max_reserved_in_bytes_; } + size_t max_memory_reserved() const { return max_reserved_in_bytes_; } }; class DIPU_API CacheAllocator : public c10::Allocator, public MemStats { @@ -78,7 +72,7 @@ class DIPU_API CacheAllocator : public c10::Allocator, public MemStats { void free_raw(void* ptr) { return raw_allocator()->raw_deallocate(ptr); } public: - CacheAllocator() {} + CacheAllocator() = default; void set_raw_allocator(c10::Allocator* raw_allocator) { raw_allocator_ = raw_allocator; @@ -89,9 +83,7 @@ class DIPU_API CacheAllocator : public c10::Allocator, public MemStats { async_mem_pool_ = async_mem_pool; } - virtual ~CacheAllocator(){ - - }; + ~CacheAllocator() override = default; virtual void empty_cache() const = 0; @@ -100,8 +92,7 @@ class DIPU_API CacheAllocator : public c10::Allocator, public MemStats { c10::Device& device() const { return device_; } class DataPtrContextBase { - private: - std::set streams_; + ska::flat_hash_set streams_; mutable const CacheAllocator* allocator_ = nullptr; void* ptr_ = nullptr; size_t size_ = 0; @@ -123,28 +114,28 @@ class DIPU_API CacheAllocator : public c10::Allocator, public MemStats { ~DataPtrContextBase() { MemChecker::instance().erase(ptr_); } - std::set& streams() { return streams_; } + ska::flat_hash_set& streams() { return streams_; } const CacheAllocator* allocator() { return allocator_; } void* ptr() { return ptr_; } - size_t size() { return size_; } + size_t size() const { return size_; } }; }; -void setAllocator(const std::string name, c10::DeviceType device_type, - std::function allocator_get_fn, +void setAllocator(const std::string& name, c10::DeviceType device_type, + const std::function& allocator_getter, uint8_t priority = 0); c10::Allocator* getAllocator(c10::DeviceType device_type); -namespace { // For internal implementation only +namespace allocator_details { // For internal implementation only struct AllocatorRegisterer { explicit 
AllocatorRegisterer( - const std::string name, c10::DeviceType device_type, - std::function allocator_get_fn, + const std::string& name, c10::DeviceType device_type, + const std::function& allocator_get_fn, uint8_t priority = 0) { setAllocator(name, device_type, allocator_get_fn, priority); } @@ -180,42 +171,46 @@ c10::Allocator* get_allocator_impl(c10::Allocator* raw_allocator) { template c10::Allocator* get_allocator(int device_id, c10::Allocator* raw_allocator) { -#define allocator_dispatch_device_id(id) \ - if (device_id == id) { \ +#define DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(id) \ + if (device_id == (id)) { \ return get_allocator_impl( \ raw_allocator); \ } - allocator_dispatch_device_id(0); - allocator_dispatch_device_id(1); - allocator_dispatch_device_id(2); - allocator_dispatch_device_id(3); - allocator_dispatch_device_id(4); - allocator_dispatch_device_id(5); - allocator_dispatch_device_id(6); - allocator_dispatch_device_id(7); - allocator_dispatch_device_id(8); - allocator_dispatch_device_id(9); - allocator_dispatch_device_id(10); - allocator_dispatch_device_id(11); - allocator_dispatch_device_id(12); - allocator_dispatch_device_id(13); - allocator_dispatch_device_id(14); - allocator_dispatch_device_id(15); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(0); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(1); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(2); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(3); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(4); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(5); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(6); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(7); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(8); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(9); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(10); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(11); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(12); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(13); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(14); + DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(15); TORCH_CHECK(false, "support up to 16 cards"); } +#undef DIPU_ALLOCATOR_DISPATCH_DEVICE_ID #define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, priority) \ namespace name##device_type { \ - static RawAllocator::type raw_allocator; \ - using AsyncMemPool = AsyncResourcePoolImpl, \ - device_type, priority>; \ - static std::function allocator_get_fn = \ - std::bind(get_allocator, \ - std::placeholders::_1, &raw_allocator); \ - static AllocatorRegisterer g_allocator(#name, device_type, \ - allocator_get_fn, priority); \ + static allocator_details::RawAllocator::type \ + raw_allocator; \ + using AsyncMemPool = \ + AsyncResourcePoolImpl, \ + at::DeviceType::device_type, priority>; \ + static const std::function allocator_get_fn = \ + std::bind( \ + allocator_details::get_allocator, \ + std::placeholders::_1, &raw_allocator); \ + static const allocator_details::AllocatorRegisterer g_allocator( \ + #name, at::DeviceType::device_type, allocator_get_fn, priority); \ } -} // namespace +} // namespace allocator_details } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocatorUtils.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocatorUtils.h index 19455be9a7..2e5e71cbd1 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocatorUtils.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocatorUtils.h @@ -21,8 +21,8 @@ void initCachedAllocator(); void releaseAllDeviceMem(); -void recordStream(const c10::DataPtr& ptr, DIPUStream stream); +void recordStream(const c10::DataPtr& ptr, const DIPUStream& stream); -void recordStream(const 
at::Tensor& tensor, DIPUStream stream); +void recordStream(const at::Tensor& tensor, const DIPUStream& stream); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp index e28073f950..2f1607abab 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.cpp @@ -15,7 +15,6 @@ namespace dipu { static void DIPURawDeviceAllocatorDeleter(void* ptr) { if (ptr) { - auto device = devproxy::current_device(); DIPU_DEBUG_ALLOCATOR(2, "devproxy::freeDevice: free " << ptr); // When only one stream is involved, in order to improve performance and // memory usage, we actually do not use events for synchronization. The @@ -29,7 +28,7 @@ static void DIPURawDeviceAllocatorDeleter(void* ptr) { } } -DIPURawDeviceAllocator::DIPURawDeviceAllocator() {} +DIPURawDeviceAllocator::DIPURawDeviceAllocator() = default; c10::DataPtr DIPURawDeviceAllocator::allocate(size_t size) const { auto idx = devproxy::current_device(); @@ -40,6 +39,7 @@ c10::DeleterFnPtr DIPURawDeviceAllocator::raw_deleter() const { return &DIPURawDeviceAllocatorDeleter; } +// NOLINTNEXTLINE(readability-convert-member-functions-to-static) c10::DataPtr DIPURawDeviceAllocator::allocate( size_t nbytes, c10::DeviceIndex device_index) const { std::lock_guard lock(mutex_); @@ -55,7 +55,7 @@ c10::DataPtr DIPURawDeviceAllocator::allocate( class DIPURawHostAllocatorImpl final { public: - std::pair allocate(size_t size) { + static std::pair allocate(size_t size) { if (size == 0) { return {nullptr, nullptr}; } @@ -71,7 +71,7 @@ class DIPURawHostAllocatorImpl final { return {data, data}; } - void free(void* ctx) { + static void free(void* ctx) { if (ctx == nullptr) { return; } @@ -85,7 +85,7 @@ class DIPURawHostAllocatorImpl final { ctx = nullptr; } - bool isPinnedPtr(const void* p) { + static bool isPinnedPtr(const void* p) { bool is_pinned = false; { std::lock_guard lck(mtx_); @@ -98,10 +98,9 @@ class DIPURawHostAllocatorImpl final { if (cp >= cptr && cp < max_ptr) { is_pinned = true; break; - } else { - if (cp >= max_ptr) { - break; - } + } + if (cp >= max_ptr) { + break; } } } @@ -109,20 +108,23 @@ class DIPURawHostAllocatorImpl final { } private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) static std::mutex mtx_; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) static std::map blocks_; }; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::map DIPURawHostAllocatorImpl::blocks_; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) std::mutex DIPURawHostAllocatorImpl::mtx_; namespace { -static DIPURawHostAllocatorImpl dipu_host_allocator; +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DIPURawHostAllocatorImpl dipu_host_allocator; -static void DIPURawHostAllocatorDeleter(void* ctx) { - dipu_host_allocator.free(ctx); -} +void DIPURawHostAllocatorDeleter(void* ctx) { dipu_host_allocator.free(ctx); } } // namespace diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h index e8983fafbf..d26e5411a6 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawAllocator.h @@ -21,7 +21,7 @@ namespace dipu { auto env = std::getenv("DIPU_DEBUG_ALLOCATOR"); \ 
return env ? std::atoi(env) : 0; \ }(); \ - if (((mask)&value) == (mask)) { \ + if ((mask & value) == mask) { \ std::cout << "[" << std::this_thread::get_id() << "]" << x << std::endl; \ } \ } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp index d558667d5b..f986852d42 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPURawCachingAllocator.cpp @@ -4,13 +4,13 @@ namespace dipu { -static void deleteRawCachingAllocatorContext(void*); +static void deleteRawCachingAllocatorContext(void* ptr); class RawCachingAllocator : public CacheAllocator { public: - RawCachingAllocator() {} + RawCachingAllocator() = default; - ~RawCachingAllocator() {} + ~RawCachingAllocator() override = default; class Context : public DataPtrContextBase { public: @@ -19,9 +19,9 @@ class RawCachingAllocator : public CacheAllocator { : DataPtrContextBase(allocator, ptr, size), real_size_(real_size) {} ~Context() { std::deque events; - for (auto iter = streams().begin(); iter != streams().end(); iter++) { + for (const auto& item : streams()) { events.emplace_back(); - events.back().record(*iter); + events.back().record(item); } auto allocator_ = static_cast(allocator()); allocator_->async_mem_pool()->add(std::make_tuple(ptr(), size()), events); @@ -32,9 +32,10 @@ class RawCachingAllocator : public CacheAllocator { size_t real_size_ = 0; }; - size_t getAllocateSize(size_t nbytes) const { + static size_t getAllocateSize(size_t nbytes) { static const size_t kMinAllocationSize = []() { - size_t size = 512; + const int kBytesNum = 512; + size_t size = kBytesNum; const char* env = std::getenv("DIPU_RAW_ALLOCATOR_MIN_ALLOCATE_SIZE"); if (env != nullptr) { size = std::atoi(env); @@ -54,8 +55,8 @@ class RawCachingAllocator : public CacheAllocator { auto ptr = raw_allocator()->raw_allocate(nbytes); set_memory_reserved(memory_reserved() + nbytes); set_memory_allocated(memory_allocated() + nbytes); - return c10::DataPtr(ptr, new Context(this, ptr, size, nbytes), - deleteRawCachingAllocatorContext, device()); + return {ptr, new Context(this, ptr, size, nbytes), + deleteRawCachingAllocatorContext, device()}; } void empty_cache() const override { @@ -84,8 +85,9 @@ static void deleteRawCachingAllocatorContext(void* ptr) { auto ctx = static_cast(ptr); delete ctx; } - -DIPU_REGISTER_ALLOCATOR(RAW, dipu::DIPU_DEVICE_TYPE, RawCachingAllocator, 0); -DIPU_REGISTER_ALLOCATOR(RAW, at::DeviceType::CPU, RawCachingAllocator, 0); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DIPU_REGISTER_ALLOCATOR(RAW, DIPU_DEVICE_TYPE_MACRO, RawCachingAllocator, 0); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DIPU_REGISTER_ALLOCATOR(RAW, CPU, RawCachingAllocator, 0); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h b/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h index bfc4182b55..bd50069d9c 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/guardimpl/DIPUGuardImpl.h @@ -4,6 +4,7 @@ #include +#include #include #include @@ -66,11 +67,11 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { c10::Stream getStreamFromGlobalPool(c10::Device d, bool isHighPriority) const override { - return getDIPUStreamFromPool(d.index()); + return 
getDIPUStreamFromPool(d.index()).unwrap(); } c10::Stream getDefaultStream(c10::Device device) const override { - return getDefaultDIPUStream(device.index()); + return getDefaultDIPUStream(device.index()).unwrap(); } void record(void** event, const c10::Stream& s, @@ -83,7 +84,6 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { auto dipu_event = static_cast(*event); auto stream = DIPUStream(s); - auto raw_stream = stream.rawstream(); // Moves to queue's device to record const c10::Device orig_device = this->getDevice(); @@ -93,7 +93,7 @@ struct DIPUGuardImpl : public c10::impl::DeviceGuardImplInterface { if (!dipu_event) { devproxy::createEvent(&dipu_event); } - devproxy::recordEvent(dipu_event, raw_stream); + devproxy::recordEvent(dipu_event, stream.rawstream()); *event = dipu_event; // Resets device diff --git a/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h b/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h index 364844e3ad..ff8557c5cb 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h @@ -16,7 +16,7 @@ namespace dipu { // "default", "hidden", "protected" or "internal #define DIPU_HIDDEN __attribute__((visibility("hidden"))) -typedef int32_t enum_t; +using enum_t = int32_t; #define DIPU_STRING(x) #x #define DIPU_CODELOC __FILE__ " (" DIPU_STRING(__LINE__) ")" @@ -38,6 +38,7 @@ enum class VendorDeviceType : enum_t { GCU, // gcu SUPA, // Biren DROPLET, // droplet + KLX, // Kunlunxin }; enum class EventStatus : enum_t { PENDING, RUNNING, DEFERRED, READY }; @@ -54,14 +55,14 @@ enum class MemCPKind : enum_t { D2D, }; -typedef enum { +enum diclResult_t { /*! The operation was successful. */ DICL_SUCCESS = 0x0, /*! undefined error */ DICL_ERR_UNDEF = 0x01000, -} diclResult_t; +}; struct DIPUDeviceStatus { size_t freeGlobalMem = 0; diff --git a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp index 275d358e80..2ac23ef3c0 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp @@ -28,7 +28,7 @@ DIPUDeviceStatus getDeviceStatus(int32_t device_index) { if (devapis::getDeviceStatus) { return devapis::getDeviceStatus(device_index); } - return DIPUDeviceStatus(); + return {}; } // set current device given device according to id @@ -41,7 +41,10 @@ void syncDevice() { return devapis::syncDevice(); } // check last launch succ or not, throw if fail void checkLastError() { return devapis::checkLastError(); } -int getDeviceCount() { return devapis::getDeviceCount(); } +int getDeviceCount() { + static int device_count = devapis::getDeviceCount(); + return device_count; +} void getDriverVersion(int* version) { return devapis::getDriverVersion(version); diff --git a/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp b/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp index d0dad8c14d..2b0e98f7d0 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/distributed/DICLUtils.hpp @@ -5,6 +5,8 @@ #include +#include +#include #include #include diff --git a/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp b/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp index 7b7b325efe..6b0a3fb396 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/distributed/ProcessGroupDICL.cpp @@ -19,8 +19,7 @@ namespace { 
// Get the list of devices from list of tensors, collective comm always use all // ranks, so no rank prefix required in key. -static inline std::string getDevieceIds( - const std::vector& devices) { +std::string getDevieceIds(const std::vector& devices) { std::string deviceList; for (auto& device : devices) { if (deviceList.empty()) { @@ -32,7 +31,7 @@ static inline std::string getDevieceIds( return deviceList; } -static inline pair mapPGRank2P2P(int myRank, int peer) { +pair mapPGRank2P2P(int myRank, int peer) { // ProcessGroupNCCL support send/recv self, but that seems only work with // ncclGroup? TORCH_CHECK(myRank != peer, @@ -49,15 +48,14 @@ static inline pair mapPGRank2P2P(int myRank, int peer) { // Get p2p sorted ranks as key, p2p only support 1 device tensor at a time and // one comm endpoint can bind with either device. so use rank as comm key is // enough. -static inline std::string getP2PRankIds( - int myRank, int peer, const std::vector& devices) { +std::string getP2PRankIds(int myRank, int peer, + const std::vector& devices) { int lowRank = myRank < peer ? myRank : peer; int highRank = myRank < peer ? peer : myRank; return std::to_string(lowRank) + ":" + std::to_string(highRank); } -static inline std::vector getDeviceList( - const std::vector& tensors) { +std::vector getDeviceList(const std::vector& tensors) { std::vector res; res.reserve(tensors.size()); for (auto& tensor : tensors) { @@ -66,9 +64,9 @@ static inline std::vector getDeviceList( return res; } -static inline void syncStreams(std::vector>& comms) { - for (size_t i = 0; i < comms.size(); ++i) { - comms[i]->preSyncStream(); +void syncStreams(std::vector>& comms) { + for (auto& comm : comms) { + comm->preSyncStream(); } } @@ -89,12 +87,8 @@ bool ProcessGroupDICL::WorkDICL::isSuccess() const { } bool ProcessGroupDICL::WorkDICL::finishedDICLExecutionInternal() const { - for (auto& workEvent : workEvents_) { - if (!workEvent.query()) { - return false; - } - } - return true; + return std::all_of(workEvents_.begin(), workEvents_.end(), + [](const DIPUEvent& e) { return e.query(); }); } // record post work event on communicator stream @@ -138,6 +132,7 @@ void ProcessGroupDICL::WorkDICL::synchronize() { } // Same as calling synchronize(). +// NOLINTNEXTLINE(google-default-arguments) bool ProcessGroupDICL::WorkDICL::wait(std::chrono::milliseconds timeout) { synchronize(); return true; @@ -175,7 +170,7 @@ ProcessGroupDICL::ProcessGroupDICL(const c10::intrusive_ptr& store, } } -ProcessGroupDICL::~ProcessGroupDICL() {} +ProcessGroupDICL::~ProcessGroupDICL() = default; void ProcessGroupDICL::broadcastUniqueID(commUniqueId* uniqueId, const std::string& storeKey, @@ -228,7 +223,7 @@ std::vector>& ProcessGroupDICL::getDICLComms( } // not cached, create a new entry std::vector> diclComms; - auto devSize = devices.size(); + int devSize = static_cast(devices.size()); diclComms.resize(devSize); int deviceWorldSize = isP2POp(opType, false) ? 2 : getSize() * devSize; @@ -264,7 +259,7 @@ namespace { // Flatten each list in `tensor_lists' for a gather or scatter operation, and // ensure compatibility with the corresponding tensor in `other'. 
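The point-to-point helpers above key the communicator cache by the sorted rank pair, so the sender and the receiver of the same send/recv pair resolve to the same entry. A tiny sketch of that keying; p2pKey is an illustrative name for what getP2PRankIds does here:

```cpp
#include <algorithm>
#include <cassert>
#include <string>

// Order-insensitive key for a p2p communicator between two ranks, matching
// the "low:high" convention used by getP2PRankIds above.
std::string p2pKey(int my_rank, int peer_rank) {
  const int low = std::min(my_rank, peer_rank);
  const int high = std::max(my_rank, peer_rank);
  return std::to_string(low) + ":" + std::to_string(high);
}

int main() {
  // Both ends of the same send/recv pair look up the same communicator.
  assert(p2pKey(3, 1) == p2pKey(1, 3));
  assert(p2pKey(0, 7) == "0:7");
  return 0;
}
```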
-static inline std::vector flatten_for_scatter_gather( +std::vector flatten_for_scatter_gather( std::vector>& tensor_lists, std::vector& other, size_t world_size) { if (tensor_lists.size() != other.size()) { @@ -303,9 +298,8 @@ static inline std::vector flatten_for_scatter_gather( } template -static inline void copyInCommStream(std::shared_ptr& diclComm, - const Dest& dest, const Src& src, - int nums) { +void copyInCommStream(std::shared_ptr& diclComm, const Dest& dest, + const Src& src, int nums) { auto diclStream = diclComm->diclStream_; DIPUStreamGuard guard(diclStream.unwrap()); for (size_t j = 0; j < nums; ++j) { @@ -318,16 +312,16 @@ static inline void copyInCommStream(std::shared_ptr& diclComm, } } -static inline void copyInCurrentStream(std::shared_ptr& diclComm, - const std::vector& dest, - const at::Tensor& src) { +void copyInCurrentStream(std::shared_ptr& diclComm, + const std::vector& dest, + const at::Tensor& src) { auto diclStream = diclComm->diclStream_; auto currStream = dipu::getCurrentDIPUStream(diclStream.device_index()); diclComm->preCopyEvent_.record(diclStream); // copy after comm finish, loss concurrency,assume all dest finish in one comm // op diclComm->preCopyEvent_.wait(currStream); - for (size_t j = 0; j < dest.size(); ++j) { + for (int64_t j = 0; j < dest.size(); ++j) { dest[j].copy_(src[j], true); } } @@ -337,7 +331,7 @@ static inline void copyInCurrentStream(std::shared_ptr& diclComm, // device specific check void ProcessGroupDICL::checkDeviceTensors( const std::vector& tensors) { - if (tensors.size() == 0) { + if (tensors.empty()) { TORCH_CHECK(false, "Tensor list must be nonempty"); } if (tensors.size() > static_cast(devproxy::getDeviceCount())) { @@ -351,7 +345,7 @@ void ProcessGroupDICL::checkDeviceTensors( std::unordered_set usedDevices; usedDevices.reserve(tensors.size()); - for (auto tensor : tensors) { + for (const auto& tensor : tensors) { if (!dipu::isDeviceTensor(tensor) || !tensor.is_non_overlapping_and_dense()) { TORCH_CHECK(false, "Tensors must be DIPU and non-overlapping and dense"); @@ -411,7 +405,7 @@ c10::intrusive_ptr ProcessGroupDICL::doComm( // todo:: dipu need support multistream guard & remove // work->workEvents_(future already has events ). { - DIPUStreamGuard streamGuard(diclComms[0]->diclStream_); + DIPUStreamGuard guard(diclComms[0]->diclStream_.unwrap()); work->future_ = c10::make_intrusive( c10::ListType::create(c10::TensorType::get()), devices); @@ -472,6 +466,7 @@ c10::intrusive_ptr ProcessGroupDICL::pointToPoint( return doComm(inputs, outputs, diclComms, devices, fn, pre, post, opType); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { // inplace in = out, every rank use both in&out. 
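copyInCurrentStream above applies the usual cross-stream ordering recipe: record an event on the communication stream, make the consuming stream wait on it, then copy (losing some concurrency, as its comment notes). A schematic, host-side toy of that ordering; Stream and Event here are stand-ins for DIPUStream and DIPUEvent, not the real device types:

```cpp
#include <functional>
#include <iostream>
#include <vector>

// Toy stand-ins: a Stream is an ordered list of tasks, and an Event "fires"
// when the recording stream reaches it. Real device events block the waiting
// stream on hardware; the ordering rule illustrated here is the same.
struct Stream {
  std::vector<std::function<void()>> tasks;
  void enqueue(std::function<void()> f) { tasks.push_back(std::move(f)); }
  void drain() { for (auto& t : tasks) t(); tasks.clear(); }
};

struct Event {
  bool fired = false;
  void record(Stream& s) { s.enqueue([this] { fired = true; }); }
  void wait() const { if (!fired) std::cout << "would block here\n"; }
};

int main() {
  Stream comm_stream, current_stream;
  Event copy_ready;

  comm_stream.enqueue([] { std::cout << "allgather fills flattened buffer\n"; });
  copy_ready.record(comm_stream);   // event recorded after the comm kernel

  current_stream.enqueue([&] {
    copy_ready.wait();              // do not copy before the comm has finished
    std::cout << "scatter flattened buffer into per-tensor outputs\n";
  });

  comm_stream.drain();
  current_stream.drain();
  return 0;
}
```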
@@ -481,16 +476,17 @@ c10::intrusive_ptr ProcessGroupDICL::allreduce( [&](at::Tensor& input, at::Tensor& output, diclComm_t comm, DIPUStream& stream) { RECORD_FUNCTION("DiclAllreduce", std::vector({input})); - profile::RecordBlockCreator _("DiclAllreduce", - profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); - return devproxy::diclAllReduce( - input.data_ptr(), output.data_ptr(), (size_t)input.numel(), - input.scalar_type(), opts.reduceOp, comm, stream.rawstream()); + profile::RecordBlockCreator _("DiclAllreduce", stream.rawstream(), + static_cast(stream.id())); + return devproxy::diclAllReduce(input.data_ptr(), output.data_ptr(), + static_cast(input.numel()), + input.scalar_type(), opts.reduceOp, comm, + stream.rawstream()); }, OpType::ALLREDUCE); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::broadcast( std::vector& tensors, const BroadcastOptions& opts) { checkDeviceTensors(tensors); @@ -500,18 +496,19 @@ c10::intrusive_ptr ProcessGroupDICL::broadcast( [&](at::Tensor& input, at::Tensor& output, diclComm_t comm, DIPUStream& stream) { RECORD_FUNCTION("DiclBroadcast", std::vector({input})); - profile::RecordBlockCreator _("DiclBroadcast", - profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); + profile::RecordBlockCreator _("DiclBroadcast", stream.rawstream(), + static_cast(stream.id())); // only one root (root rank root device) const auto root = opts.rootRank * tensors.size() + opts.rootTensor; return devproxy::diclBroadcast( - input.data_ptr(), input.data_ptr(), (size_t)input.numel(), - input.scalar_type(), root, comm, stream.rawstream()); + input.data_ptr(), input.data_ptr(), + static_cast(input.numel()), input.scalar_type(), + static_cast(root), comm, stream.rawstream()); }, OpType::BROADCAST); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::reduce( std::vector& tensors, const ReduceOptions& opts) { // inplace in = out, only rootRank use out. 
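The broadcast and reduce paths below fold the (rootRank, rootTensor) pair into a single index because each process appears to contribute tensors.size() consecutive ranks to the communicator. A tiny sketch of that arithmetic, with rootIndex as an illustrative name:

```cpp
#include <cassert>
#include <cstddef>

// With N devices (tensors) per process, the communicator rank of device t on
// process r is r * N + t, so the root device is rootRank * N + rootTensor.
constexpr size_t rootIndex(size_t root_rank, size_t devices_per_rank,
                           size_t root_tensor) {
  return root_rank * devices_per_rank + root_tensor;
}

int main() {
  // 2 processes x 2 devices: device 1 of process 1 is communicator rank 3.
  static_assert(rootIndex(1, 2, 1) == 3, "root index");
  assert(rootIndex(0, 4, 2) == 2);
  return 0;
}
```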
@@ -524,22 +521,25 @@ c10::intrusive_ptr ProcessGroupDICL::reduce( [&](at::Tensor& input, at::Tensor& output, diclComm_t comm, DIPUStream& stream) { RECORD_FUNCTION("DiclReduce", std::vector({input})); - profile::RecordBlockCreator _("DiclReduce", profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); + profile::RecordBlockCreator _("DiclReduce", stream.rawstream(), + static_cast(stream.id())); const auto root = opts.rootRank * tensors.size() + opts.rootTensor; return devproxy::diclReduce( - input.data_ptr(), output.data_ptr(), (size_t)input.numel(), - input.scalar_type(), opts.reduceOp, root, comm, stream.rawstream()); + input.data_ptr(), output.data_ptr(), + static_cast(input.numel()), input.scalar_type(), + opts.reduceOp, static_cast(root), comm, stream.rawstream()); }, OpType::REDUCE); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts) { TORCH_CHECK(false, "ProcessGroupDICL does not support gather now"); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts) { @@ -553,13 +553,13 @@ c10::intrusive_ptr ProcessGroupDICL::allgather( [&](at::Tensor& input, at::Tensor& output, diclComm_t comm, DIPUStream& stream) { RECORD_FUNCTION("DiclAllgather", std::vector({input})); - profile::RecordBlockCreator _("DiclAllgather", - profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); + profile::RecordBlockCreator _("DiclAllgather", stream.rawstream(), + static_cast(stream.id())); - return devproxy::diclAllGather( - input.data_ptr(), output.data_ptr(), (size_t)input.numel(), - input.scalar_type(), comm, stream.rawstream()); + return devproxy::diclAllGather(input.data_ptr(), output.data_ptr(), + static_cast(input.numel()), + input.scalar_type(), comm, + stream.rawstream()); }, [&](std::vector>& diclComms) {}, [&](std::vector>& diclComms) { @@ -569,7 +569,7 @@ c10::intrusive_ptr ProcessGroupDICL::allgather( // record dest tensor outputs, because src tensor outputFlattened // already recorded in collective. 
copyInCommStream(diclComms[i], outputs[i], outputFlattened[i], - outputs[i].size()); + static_cast(outputs[i].size())); // copyInCurrentStream(diclComms[i], outputs[i], outputFlattened[i]); } }, @@ -577,6 +577,7 @@ c10::intrusive_ptr ProcessGroupDICL::allgather( return work; } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::_allgather_base( at::Tensor& outputTensor, at::Tensor& inputTensor, const AllgatherOptions& opts) { @@ -597,16 +598,17 @@ c10::intrusive_ptr ProcessGroupDICL::_allgather_base( DIPUStream& stream) { RECORD_FUNCTION("DiclAllgather_base", std::vector({input})); - profile::RecordBlockCreator _("DiclAllgather_base", - profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); - return devproxy::diclAllGather( - input.data_ptr(), output.data_ptr(), (size_t)input.numel(), - input.scalar_type(), comm, stream.rawstream()); + profile::RecordBlockCreator _("DiclAllgather_base", stream.rawstream(), + static_cast(stream.id())); + return devproxy::diclAllGather(input.data_ptr(), output.data_ptr(), + static_cast(input.numel()), + input.scalar_type(), comm, + stream.rawstream()); }, OpType::_ALLGATHER_BASE); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::_reduce_scatter_base( at::Tensor& outputTensor, at::Tensor& inputTensor, const ReduceScatterOptions& opts) { @@ -628,15 +630,17 @@ c10::intrusive_ptr ProcessGroupDICL::_reduce_scatter_base( RECORD_FUNCTION("DiclReduceScatter_base", std::vector({input})); profile::RecordBlockCreator _("DiclReduceScatter_base", - profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); - return devproxy::diclReduceScatter( - input.data_ptr(), output.data_ptr(), (size_t)output.numel(), - input.scalar_type(), opts.reduceOp, comm, stream.rawstream()); + stream.rawstream(), + static_cast(stream.id())); + return devproxy::diclReduceScatter(input.data_ptr(), output.data_ptr(), + static_cast(output.numel()), + input.scalar_type(), opts.reduceOp, + comm, stream.rawstream()); }, OpType::_REDUCE_SCATTER_BASE); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::reduce_scatter( std::vector& outputs, std::vector>& inputs, @@ -652,12 +656,12 @@ c10::intrusive_ptr ProcessGroupDICL::reduce_scatter( [&](at::Tensor& input, at::Tensor& output, diclComm_t comm, DIPUStream& stream) { RECORD_FUNCTION("DiclReduceScatter", std::vector({input})); - profile::RecordBlockCreator _("DiclReduceScatter", - profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); - return devproxy::diclReduceScatter( - input.data_ptr(), output.data_ptr(), (size_t)output.numel(), - input.scalar_type(), opts.reduceOp, comm, stream.rawstream()); + profile::RecordBlockCreator _("DiclReduceScatter", stream.rawstream(), + static_cast(stream.id())); + return devproxy::diclReduceScatter(input.data_ptr(), output.data_ptr(), + static_cast(output.numel()), + input.scalar_type(), opts.reduceOp, + comm, stream.rawstream()); }, [&](std::vector>& diclComms) { // Copy the inputs[i].size nums raw tensor intto flattened @@ -665,7 +669,7 @@ c10::intrusive_ptr ProcessGroupDICL::reduce_scatter( // record src tensor inputs, because dest tensor inputFlattened // already recorded in collective copyInCommStream(diclComms[i], inputFlattened[i], inputs[i], - inputs[0].size()); + static_cast(inputs[0].size())); } }, [&](std::vector>& diclComms) {}, @@ -682,11 +686,11 @@ c10::intrusive_ptr ProcessGroupDICL::send( [&](at::Tensor& input, at::Tensor& output, diclComm_t comm, DIPUStream& stream) { 
RECORD_FUNCTION("diclSend", std::vector({input})); - profile::RecordBlockCreator _("diclSend", profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); - return devproxy::diclSend(input.data_ptr(), (size_t)input.numel(), - input.scalar_type(), p2pPair.second, comm, - stream.rawstream()); + profile::RecordBlockCreator _("diclSend", stream.rawstream(), + static_cast(stream.id())); + return devproxy::diclSend( + input.data_ptr(), static_cast(input.numel()), + input.scalar_type(), p2pPair.second, comm, stream.rawstream()); }, [](std::vector>&) {}, [](std::vector>&) {}, OpType::SEND); @@ -701,30 +705,33 @@ c10::intrusive_ptr ProcessGroupDICL::recv( [&](at::Tensor& input, at::Tensor& output, diclComm_t comm, DIPUStream& stream) { RECORD_FUNCTION("diclRecv", std::vector({input})); - profile::RecordBlockCreator _("diclRecv", profile::ExtraRecordInfo(), - stream.rawstream(), stream.id()); - return devproxy::diclRecv(input.data_ptr(), (size_t)input.numel(), - input.scalar_type(), p2pPair.second, comm, - stream.rawstream()); + profile::RecordBlockCreator _("diclRecv", stream.rawstream(), + static_cast(stream.id())); + return devproxy::diclRecv( + input.data_ptr(), static_cast(input.numel()), + input.scalar_type(), p2pPair.second, comm, stream.rawstream()); }, [](std::vector>&) {}, [](std::vector>&) {}, OpType::RECV); } +// NOLINTNEXTLINE(google-default-arguments) c10::intrusive_ptr ProcessGroupDICL::barrier(const BarrierOptions& opts) { std::vector devices; if (usedDeviceIdxs_.empty()) { auto numDIPUs = devproxy::getDeviceCount(); int16_t deviceIdx = static_cast(rank_ % std::max(static_cast(numDIPUs), 1)); - devices.push_back(at::Device(dipu::DIPU_DEVICE_TYPE, deviceIdx)); + devices.emplace_back(dipu::DIPU_DEVICE_TYPE, + static_cast(deviceIdx)); } else { for (auto usedDeviceIdx : usedDeviceIdxs_) { - devices.push_back(at::Device(dipu::DIPU_DEVICE_TYPE, usedDeviceIdx)); + devices.emplace_back(dipu::DIPU_DEVICE_TYPE, + static_cast(usedDeviceIdx)); } } - std::vector barrierTensors; + std::vector barrierTensors{}; barrierTensors.reserve(devices.size()); OptionalDIPUGuard dipuGuard; diff --git a/dipu/torch_dipu/csrc_dipu/stub.cpp b/dipu/torch_dipu/csrc_dipu/stub.cpp index 35d7fdd65e..b48ee04e4d 100644 --- a/dipu/torch_dipu/csrc_dipu/stub.cpp +++ b/dipu/torch_dipu/csrc_dipu/stub.cpp @@ -1,6 +1,7 @@ // Copyright (c) 2023, DeepLink. #include +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) static std::vector methods; static void AddPyMethodDefs(std::vector& vector, diff --git a/dipu/torch_dipu/csrc_dipu/utils/Log.h b/dipu/torch_dipu/csrc_dipu/utils/Log.h index 66a22d8011..c9d73a6843 100644 --- a/dipu/torch_dipu/csrc_dipu/utils/Log.h +++ b/dipu/torch_dipu/csrc_dipu/utils/Log.h @@ -1,19 +1,18 @@ // Copyright (c) 2023, DeepLink. 
#pragma once -#include -#include +#include // IWYU pragma: export #define CONCAT_(prefix, suffix) prefix##suffix #define CONCAT(prefix, suffix) CONCAT_(prefix, suffix) #define MAKE_UNIQUE_VARIABLE_NAME(prefix) CONCAT(prefix##_, __LINE__) #define DIPU_LOG std::cout << __FILE__ << ":" << __LINE__ << " " -#define DIPU_LOG_ONCE \ - static auto& __attribute__((unused)) MAKE_UNIQUE_VARIABLE_NAME(__func__) = \ - DIPU_LOG +#define DIPU_LOG_ONCE \ + static const auto& __attribute__((unused)) \ + MAKE_UNIQUE_VARIABLE_NAME(__func__) = DIPU_LOG #define DIPU_LOG_ERROR std::cerr << __FILE__ << ":" << __LINE__ << " " -#define DIPU_LOG_ERROR_ONCE \ - static auto& __attribute__((unused)) MAKE_UNIQUE_VARIABLE_NAME(__func__) = \ - DIPU_LOG_ERROR +#define DIPU_LOG_ERROR_ONCE \ + static const auto& __attribute__((unused)) \ + MAKE_UNIQUE_VARIABLE_NAME(__func__) = DIPU_LOG_ERROR diff --git a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp index b056de6124..ebb66f1550 100644 --- a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp +++ b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.cpp @@ -12,6 +12,7 @@ bool isDeviceTensor(const at::Tensor& tensor) { return tensor.unsafeGetTensorImpl()->device_type() == dipu::DIPU_DEVICE_TYPE; } +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) static bool in_bad_fork = false; bool is_in_bad_fork() { return in_bad_fork; } diff --git a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp index ae4ce62477..efc14606ca 100644 --- a/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp +++ b/dipu/torch_dipu/csrc_dipu/utils/helpfunc.hpp @@ -2,10 +2,9 @@ #pragma once #include -using dipu::devapis::VendorDeviceType; - namespace dipu { +using dipu::devapis::VendorDeviceType; constexpr const char* VendorTypeToStr(VendorDeviceType t) noexcept { switch (t) { case VendorDeviceType::MLU: @@ -20,6 +19,8 @@ constexpr const char* VendorTypeToStr(VendorDeviceType t) noexcept { return "SUPA"; case VendorDeviceType::DROPLET: return "DROPLET"; + case VendorDeviceType::KLX: + return "KLX"; } return "null"; } diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp index bd65992fff..3005b66be0 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/CudaGeneratorImpl.cpp @@ -5,14 +5,14 @@ namespace dipu { -static const size_t states_size = 200 * sizeof(4120); +static const size_t states_size = 200 * sizeof(decltype(4120)); static const size_t seed_size = sizeof(uint64_t); static const size_t offset_size = sizeof(int64_t); static const size_t total_size = states_size + seed_size + offset_size; class CUDAGeneratorImpl : public dipu::DIPUGeneratorImpl { public: - CUDAGeneratorImpl(at::DeviceIndex device_index) + explicit CUDAGeneratorImpl(at::DeviceIndex device_index) : dipu::DIPUGeneratorImpl(device_index) {} void set_state(const c10::TensorImpl& state) override { @@ -30,7 +30,7 @@ class CUDAGeneratorImpl : public dipu::DIPUGeneratorImpl { void update_state() const override { if (state_need_reset_) { - state_ = at::detail::empty_cpu({(int64_t)total_size}, + state_ = at::detail::empty_cpu({static_cast(total_size)}, c10::ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); auto rng_state = state_.data_ptr(); @@ -47,6 +47,7 @@ class CUDAGeneratorImpl : public dipu::DIPUGeneratorImpl { } }; +// NOLINTNEXTLINE(readability-const-return-type) const at::Generator 
vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator(device_index); } diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp index 3ac321a48e..d456839b71 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/communiatorimpl.cpp @@ -8,7 +8,7 @@ namespace dipu { namespace devapis { // NCCL op mapping -static std::map ncclOp = { +static const std::map ncclOp = { {ReduceOp::MIN, ncclMin}, {ReduceOp::MAX, ncclMax}, {ReduceOp::SUM, ncclSum}, {ReduceOp::PRODUCT, ncclProd}, #ifdef NCCL_HAS_AVG @@ -17,7 +17,7 @@ static std::map ncclOp = { }; // NCCL type typing -static std::map ncclDataType = { +static const std::map ncclDataType = { {at::kChar, ncclInt8}, {at::kByte, ncclUint8}, {at::kFloat, ncclFloat}, {at::kDouble, ncclDouble}, {at::kInt, ncclInt32}, {at::kLong, ncclInt64}, @@ -42,13 +42,13 @@ static std::map ncclDataType = { const int DICL_UNIQUE_ID_BYTES_SIZE = NCCL_UNIQUE_ID_BYTES; DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ncclResult_t ncclAsyncErr_; NCCL_THROW(ncclCommGetAsyncError(comm, &ncclAsyncErr_)); if (ncclAsyncErr_ != ncclSuccess) { return DICL_SUCCESS; - } else { - return DICL_ERR_UNDEF; } + return DICL_ERR_UNDEF; } DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { @@ -72,8 +72,9 @@ DIPU_API diclResult_t diclAllReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclAllReduce(sendbuff, recvbuff, count, ncclDataType[datatype], - ncclOp[reduceOp], comm, stream)); + // TODO(wanglei): add .find() != .end() check. 
+ NCCL_THROW(ncclAllReduce(sendbuff, recvbuff, count, ncclDataType.at(datatype), + ncclOp.at(reduceOp), comm, stream)); return DICL_SUCCESS; } @@ -81,7 +82,7 @@ DIPU_API diclResult_t diclBroadcast(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, int root, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclBroadcast(sendbuff, recvbuff, count, ncclDataType[datatype], + NCCL_THROW(ncclBroadcast(sendbuff, recvbuff, count, ncclDataType.at(datatype), root, comm, stream)); return DICL_SUCCESS; } @@ -89,8 +90,8 @@ DIPU_API diclResult_t diclBroadcast(const void* sendbuff, void* recvbuff, DIPU_API diclResult_t diclAllGather(const void* sendBuf, void* recvBuf, size_t sendCount, at::ScalarType datatype, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclAllGather(sendBuf, recvBuf, sendCount, ncclDataType[datatype], - comm, stream)); + NCCL_THROW(ncclAllGather(sendBuf, recvBuf, sendCount, + ncclDataType.at(datatype), comm, stream)); return DICL_SUCCESS; } @@ -98,8 +99,8 @@ DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, size_t count, at::ScalarType datatype, const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) { - NCCL_THROW(ncclReduce(sendbuff, recvbuff, count, ncclDataType[datatype], - ncclOp[reduceOp], root, comm, stream)); + NCCL_THROW(ncclReduce(sendbuff, recvbuff, count, ncclDataType.at(datatype), + ncclOp.at(reduceOp), root, comm, stream)); return DICL_SUCCESS; } @@ -107,8 +108,8 @@ DIPU_API diclResult_t diclReduceScatter( void* sendBuf, void* recvBuf, size_t recvCount, at::ScalarType datatype, const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { NCCL_THROW(ncclReduceScatter(sendBuf, recvBuf, recvCount, - ncclDataType[datatype], ncclOp[reduceOp], comm, - stream)); + ncclDataType.at(datatype), ncclOp.at(reduceOp), + comm, stream)); return DICL_SUCCESS; } @@ -116,7 +117,7 @@ DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, at::ScalarType datatype, int peer, diclComm_t comm, deviceStream_t stream) { NCCL_THROW( - ncclSend(sendbuff, count, ncclDataType[datatype], peer, comm, stream)); + ncclSend(sendbuff, count, ncclDataType.at(datatype), peer, comm, stream)); return DICL_SUCCESS; } @@ -124,7 +125,7 @@ DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, at::ScalarType datatype, int peer, diclComm_t comm, deviceStream_t stream) { NCCL_THROW( - ncclRecv(recvbuff, count, ncclDataType[datatype], peer, comm, stream)); + ncclRecv(recvbuff, count, ncclDataType.at(datatype), peer, comm, stream)); return DICL_SUCCESS; } diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp index 04282160de..969c2d23f9 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/deviceimpl.cpp @@ -7,6 +7,7 @@ #include namespace dipu { +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::CUDA; @@ -22,12 +23,14 @@ void initializeVendor() {} void finalizeVendor() {} deviceId_t current_device() { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) cuda_deviceId devId_; DIPU_CALLCUDA(::cudaGetDevice(&devId_)) return static_cast(devId_); } DIPUDeviceProperties getDeviceProperties(int32_t device_index) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) ::cudaDeviceProp device_prop; DIPU_CALLCUDA(cudaGetDeviceProperties(&device_prop, device_index)) @@ -62,7 +65,7 @@ void checkLastError() { 
DIPU_CALLCUDA(::cudaGetLastError()) } int getDeviceCount() { int num = -1; - DIPU_CALLCUDA(::cudaGetDeviceCount(reinterpret_cast(&num))) + DIPU_CALLCUDA(::cudaGetDeviceCount(&num)) return num; } @@ -94,7 +97,7 @@ void destroyStream(deviceStream_t stream, deviceId_t devId) { destroyStream(stream); } -void releaseStream() { return; } +void releaseStream() {} bool streamNotNull(deviceStream_t stream) { return (stream != nullptr && stream != cudaStreamLegacy && @@ -111,10 +114,7 @@ void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { bool isStreamEmpty(deviceStream_t stream) { auto err = cudaStreamQuery(stream); - if (err == ::cudaSuccess) { - return true; - } - return false; + return err == ::cudaSuccess; } // ===================== @@ -126,9 +126,8 @@ void createEvent(deviceEvent_t* event) { const char* env = std::getenv("DIPU_CUDA_EVENT_TIMING"); if (env) { return std::atoi(env) > 0; - } else { - return true; } + return true; }(); DIPU_CALLCUDA(::cudaEventCreateWithFlags( @@ -154,13 +153,12 @@ EventStatus getEventStatus(deviceEvent_t event) { ::cudaError_t ret = ::cudaEventQuery(event); if (ret == ::cudaSuccess) { return devapis::EventStatus::READY; - } else if (ret == ::cudaErrorNotReady) { + } + if (ret == ::cudaErrorNotReady) { ::cudaGetLastError(); /* reset internal error state*/ return devapis::EventStatus::PENDING; - } else { - TORCH_CHECK(false, - "unexpected event status in getEventStatus, ret = ", ret); } + TORCH_CHECK(false, "unexpected event status in getEventStatus, ret = ", ret); } // ===================== @@ -191,6 +189,7 @@ OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) { void freeDevice(void* p) { DIPU_CALLCUDA(::cudaFree(p)) } bool isPinnedPtr(const void* p) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) ::cudaPointerAttributes attr; DIPU_CALLCUDA(::cudaPointerGetAttributes(&attr, p)) return attr.type == cudaMemoryTypeHost; diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp index 14e7401a67..453dc405b1 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/DIPUPatchCudaAllocator.cpp @@ -11,9 +11,10 @@ namespace cuda { namespace CUDACachingAllocator { -#define DIPU_PATCH_CUDA_ALLOCATOR(x) \ - std::cout << __FUNCTION__ << ":" << __LINE__ \ - << " this function should not be called!" x << std::endl; +#define DIPU_PATCH_CUDA_ALLOCATOR(x) \ + std::cout << __FUNCTION__ << ":" << __LINE__ \ + << " this function should not be called!" 
x << std::endl; \ + throw std::runtime_error("this function should not be called!"); class DIPUCUDAAllocatorProxy : public CUDAAllocator { std::unordered_map tempMemBlock; @@ -21,62 +22,56 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator { mutable mutex_t mut_; public: - virtual void* raw_alloc_with_stream(size_t nbytes, - cudaStream_t stream) override { + void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void setMemoryFraction(double fraction, int device) override { + void setMemoryFraction(double fraction, int device) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void* getBaseAllocation(void* ptr, size_t* size) override { + void* getBaseAllocation(void* ptr, size_t* size) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void recordStream(const DataPtr&, CUDAStream stream) override { + void recordStream(const DataPtr& /*unused*/, CUDAStream stream) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual DeviceStats getDeviceStats(int device) override { + DeviceStats getDeviceStats(int device) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void resetAccumulatedStats(int device) override { + void resetAccumulatedStats(int device) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void resetPeakStats(int device) override { + void resetPeakStats(int device) override { DIPU_PATCH_CUDA_ALLOCATOR(); } + SnapshotInfo snapshot() override { DIPU_PATCH_CUDA_ALLOCATOR(); } + void notifyCaptureBegin(int device, CaptureId_t graph_id, + MempoolId_t mempool_id) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual SnapshotInfo snapshot() override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void notifyCaptureBegin(int device, CaptureId_t graph_id, - MempoolId_t mempool_id) override { + void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void notifyCaptureAboutToEnd(int device, - CaptureId_t graph_id) override { + void notifyCaptureEnded(int device, CaptureId_t graph_id) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) override { + void notifyCaptureDestroy(int device, MempoolId_t mempool_id) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void notifyCaptureDestroy(int device, - MempoolId_t mempool_id) override { + std::shared_ptr getIpcDevPtr(std::string handle) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual std::shared_ptr getIpcDevPtr(std::string handle) override { + void recordHistory(bool enabled, CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + bool alloc_trace_record_context) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void recordHistory(bool enabled, CreateContextFn context_recorder, - size_t alloc_trace_max_entries, - bool alloc_trace_record_context) override { + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void attachOutOfMemoryObserver( - OutOfMemoryObserver observer) override { - DIPU_PATCH_CUDA_ALLOCATOR(); - } - virtual std::string name() override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void cacheInfo(int dev_id, size_t* largestBlock) override { + std::string name() override { DIPU_PATCH_CUDA_ALLOCATOR(); } + void cacheInfo(int dev_id, size_t* largestBlock) override { DIPU_PATCH_CUDA_ALLOCATOR(); } - virtual void* raw_alloc(size_t nbytes) override { + void* raw_alloc(size_t nbytes) override { auto data_ptr = this->allocate(nbytes); void* ptr = data_ptr.get(); std::lock_guard lk(mut_); 
@@ -84,23 +79,23 @@ class DIPUCUDAAllocatorProxy : public CUDAAllocator { return ptr; } - virtual void raw_delete(void* ptr) override { + void raw_delete(void* ptr) override { std::lock_guard lk(mut_); tempMemBlock.erase(ptr); } - virtual void init(int device_count) override {} + void init(int device_count) override {} - virtual bool initialized() override { return true; } + bool initialized() override { return true; } - virtual void emptyCache() override { dipu::emptyCachedMem(); } + void emptyCache() override { dipu::emptyCachedMem(); } - virtual bool needsPoolSpecificPeerAccess() override { + bool needsPoolSpecificPeerAccess() override { // DIPU_PATCH_CUDA_ALLOCATOR(); return false; } - virtual DataPtr allocate(size_t n) const override { + DataPtr allocate(size_t n) const override { // DIPU_PATCH_CUDA_ALLOCATOR(); auto data_ptr = c10::GetAllocator(dipu::DIPU_DEVICE_TYPE)->allocate(n); data_ptr.unsafe_set_device( @@ -142,6 +137,6 @@ int patchCachingAllocator() { and this compilation unit may not be compiled, so it is still initialized with global variables */ -static int n = patchCachingAllocator(); +static const int n = patchCachingAllocator(); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp index 5692610306..8880efef46 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/cuda/patch/wrapperRegister.cpp @@ -1,7 +1,13 @@ // Copyright (c) 2023, DeepLink. -#include -#include -#include +#include + +#include +#include +#include +#include + +#include "csrc_dipu/aten/RegisterDIPU.hpp" +#include "csrc_dipu/base/basedef.h" namespace at { diff --git a/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/CMakeLists.txt b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/CMakeLists.txt new file mode 100644 index 0000000000..4bfa284350 --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.14) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + +include(cmake/FindKLXRuntime.cmake) + +message(STATUS XPURT_INCLUDE_DIR ${XPURT_INCLUDE_DIR}) + +set(VENDOR_INCLUDE_DIRS ${VENDOR_INCLUDE_DIRS} ${XPURT_INCLUDE_DIR} ${XDNN_INCLUDE_DIR} PARENT_SCOPE) +set(VENDOR_LIB_DIRS ${VENDOR_LIB_DIRS} ${XPURT_LIBRARIES} ${XDNN_LIBRARIES} PARENT_SCOPE) +#set(DIPU_VENDOR_LIB ${DIPU_VENDOR_LIB} xpurt xpuapi PARENT_SCOPE) + +file(GLOB SRC_FILES *.cpp) +set(VENDOR_FILES ${SRC_FILES} PARENT_SCOPE) diff --git a/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/KLXGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/KLXGeneratorImpl.cpp new file mode 100644 index 0000000000..b7fffe4909 --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/KLXGeneratorImpl.cpp @@ -0,0 +1,29 @@ +#include +#include + +#include +#include +#include + +namespace dipu { + +// Discriminate floating device type. 
+// static bool is_floating_device = true; + +// just an example +// not implemented now +class KLXGeneratorImpl : public dipu::DIPUGeneratorImpl { + public: + KLXGeneratorImpl(at::DeviceIndex device_index) + : dipu::DIPUGeneratorImpl(device_index) {} + + void set_state(const c10::TensorImpl& state) override {} + + void update_state() const override {} +}; + +const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { + return at::make_generator(device_index); +} + +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/cmake/FindKLXRuntime.cmake b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/cmake/FindKLXRuntime.cmake new file mode 100644 index 0000000000..11e37c918e --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/cmake/FindKLXRuntime.cmake @@ -0,0 +1,49 @@ +set(XPURT_TOOLKIT_ROOT /workspace/baidu/personal-code/diopi/xpu_toolchain/xpurt) +set(XDNN_TOOLKIT_ROOT /workspace/baidu/personal-code/diopi/xpu_toolchain/xdnn) + +include(FindPackageHandleStandardArgs) + +## xdnn +find_path(XDNN_INCLUDE_DIR + NAMES xpu/xdnn.h + HINTS ${XDNN_TOOLKIT_ROOT}/include + $ENV{XDNN_TOOLKIT_ROOT}/include +) +message("XDNN_INCLUDE_DIR:" ${XDNN_INCLUDE_DIR}) +find_library(XDNN_LIBRARIES + NAMES xpuapi + HINTS ${XDNN_TOOLKIT_ROOT}/so + $ENV{XDNN_TOOLKIT_ROOT}/so +) +message("XDNN_TOOLKIT_ROOT: " ${XDNN_TOOLKIT_ROOT}) +message("XDNN_LIBRARIES:" ${XDNN_LIBRARIES}) +if(NOT XDNN_INCLUDE_DIR OR NOT XDNN_LIBRARIES) + message(FATAL_ERROR "Cannot find Xdnn TOOLKIT for kunlunxin, set ENV 'XDNN_TOOLKIT_ROOT' correctly") +endif() + +## runtime +find_path(XPURT_INCLUDE_DIR + NAMES xpu/runtime.h + HINTS ${XPURT_TOOLKIT_ROOT}/include + $ENV{XPURT_TOOLKIT_ROOT}/include +) +message("XPURT_INCLUDE_DIR:" ${XPURT_INCLUDE_DIR}) +find_library(XPURT_LIBRARIES + NAMES xpurt + HINTS ${XPURT_TOOLKIT_ROOT}/so + $ENV{XPURT_TOOLKIT_ROOT}/so +) +message("XPURT_LIBRARIES:" ${XPURT_LIBRARIES}) +if(NOT XPURT_INCLUDE_DIR OR NOT XPURT_LIBRARIES) + message(FATAL_ERROR "Cannot find XPURT TOOLKIT for kunlunxin, set ENV 'XPURT_TOOLKIT_ROOT' correctly") +endif() + +find_package_handle_standard_args(XPURT DEFAULT_MSG + XPURT_INCLUDE_DIR + XPURT_LIBRARIES) + +find_package_handle_standard_args(XDNN DEFAULT_MSG + XDNN_INCLUDE_DIR + XDNN_LIBRARIES) + +mark_as_advanced(XPURT_INCLUDE_DIR XPURT_LIBRARIES XDNN_INCLUDE_DIR XDNN_LIBRARIES) diff --git a/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/communicatorimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/communicatorimpl.cpp new file mode 100644 index 0000000000..b0c8bab58a --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/communicatorimpl.cpp @@ -0,0 +1,81 @@ +#include +#include + +#include +#include + +#include +#include + +namespace dipu { + +namespace devapis { + +const int DICL_UNIQUE_ID_BYTES_SIZE = 0; + +DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclGetUniqueId(pcclUniqueId* uniqueId) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, + pcclUniqueId uniqueId, int rank, + int localDeviceId) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclCommDestroy(diclComm_t comm) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclAllReduce(const void* sendbuff, void* recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp& reduceOp, diclComm_t comm, + deviceStream_t stream) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclBroadcast(const void* sendbuff, void* recvbuff, + size_t count, 
at::ScalarType datatype, + int root, diclComm_t comm, + deviceStream_t stream) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclAllGather(const void* sendBuf, void* recvBuf, + size_t count, at::ScalarType datatype, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclReduce(const void* sendbuff, void* recvbuff, + size_t count, at::ScalarType datatype, + const ReduceOp& reduceOp, int root, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclReduceScatter( + void* sendBuf, void* recvBuf, size_t recvCount, at::ScalarType datatype, + const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclSend(void* sendbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; +} + +DIPU_API diclResult_t diclRecv(void* recvbuff, size_t count, + at::ScalarType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + return DICL_ERR_UNDEF; +} + +} // end namespace devapis + +} // end namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/deviceimpl.cpp new file mode 100644 index 0000000000..4bd06f069d --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/deviceimpl.cpp @@ -0,0 +1,223 @@ + + +#include +#include + +namespace dipu { +DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::KLX; + +namespace devapis { + +using klx_deviceId = int; + +// ===================== +// Device class related +// ===================== + +void initializeVendor() {} + +void finalizeVendor() {} + +deviceId_t current_device() { + klx_deviceId devId_; + DIPU_CALLKLX(xpu_current_device(&devId_)) + return static_cast(devId_); +} + +DIPUDeviceProperties getDeviceProperties(int32_t device_index) { + DIPUDeviceProperties prop; + return prop; +} + +// set current device given device according to id +void setDevice(deviceId_t devId) { + klx_deviceId devId_ = static_cast(devId); + DIPU_CALLKLX(xpu_set_device(devId_)) +} + +void resetDevice(deviceId_t devId) { + DIPU_CALLKLX_ERROR("[kunlunxin]resetDevice is not implemented") +} + +void syncDevice() { DIPU_CALLKLX(xpu_wait()); } + +// check last launch succ or not, throw if fail +void checkLastError() { + DIPU_CALLKLX_ERROR("[kunlunxin]checkLastError is not implemented") +} + +int getDeviceCount() { + int num = -1; + DIPU_CALLKLX(xpu_device_count(reinterpret_cast(&num))) + return num; +} + +void getDriverVersion(int* version) { + uint32_t major; + uint32_t minor; + DIPU_CALLKLX(xpu_get_driver_version(&major, &minor)); + *version = static_cast(major); +} + +void getRuntimeVersion(int* version) { + uint32_t major; + uint32_t minor; + DIPU_CALLKLX(xpu_get_runtime_version(&major, &minor)); + *version = static_cast(major); +} + +// ===================== +// device stream related +// ===================== +void createStream(deviceStream_t* stream, bool prior) { + if (prior) { + DIPU_LOGW( + "kunlunxin device doesn't support prior queue(stream)." 
+ " Fall back on creating queue without priority."); + } + DIPU_CALLKLX(xpu_stream_create(stream)); +} + +void destroyStream(deviceStream_t stream) { + DIPU_CALLKLX(xpu_stream_destroy(stream)) +} + +void destroyStream(deviceStream_t stream, deviceId_t devId) { + setDevice(devId); + destroyStream(stream); +} + +void releaseStream() {} + +bool streamNotNull(deviceStream_t stream) { return stream != nullptr; } + +void syncStream(deviceStream_t stream) { DIPU_CALLKLX(xpu_wait(stream)); } + +void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { + DIPU_CALLKLX(xpu_stream_wait_event(stream, event)) +} + +bool isStreamEmpty(deviceStream_t stream) { + DIPU_CALLKLX_ERROR("[kunlunxin]isStreamEmpty is not implemented") +} + +// ===================== +// device event related +// ===================== + +void createEvent(deviceEvent_t* event) { DIPU_CALLKLX(xpu_event_create(event)) } + +void destroyEvent(deviceEvent_t event) { + DIPU_CALLKLX(xpu_event_destroy(event)) +} + +void waitEvent(deviceEvent_t event) { DIPU_CALLKLX(xpu_event_wait(event)) } + +void recordEvent(deviceEvent_t event, deviceStream_t stream) { + DIPU_CALLKLX(xpu_event_record(event, stream)) +} + +void eventElapsedTime(float* time, deviceEvent_t start, deviceEvent_t end) {} + +EventStatus getEventStatus(deviceEvent_t event) { + return devapis::EventStatus::READY; +} + +// ===================== +// mem related +// ===================== +void mallocHost(void** p, size_t nbytes) { + DIPU_CALLKLX(xpu_host_alloc(p, nbytes, 0)) +} + +void freeHost(void* p){DIPU_CALLKLX(xpu_host_free(p))} + +OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) { + if (nbytes == 0) { + return OpStatus::SUCCESS; + } + int r = xpu_malloc(p, nbytes); + if (r != 0) { + if (throwExcepion) { + DIPU_LOGE("call xpu_malloc function failed."); + throw std::runtime_error("alloc failed in dipu"); + } else if (r == XPUERR_NOMEM) { + return OpStatus::ERR_NOMEM; + } else { + return OpStatus::ERR_UNKNOWN; + } + } + return OpStatus::SUCCESS; +} + +void freeDevice(void* p) { DIPU_CALLKLX(xpu_free(p)) } + +bool isPinnedPtr(const void* p) { return false; } + +static int _xpuMemset(void* ptr, int value, size_t count, + deviceStream_t stream) { + if (count == 0) { + // skip if nothing to write. 
+ return 0; + } + if (ptr == nullptr) { + return -1; + } + + void* ptr_host = nullptr; + ptr_host = malloc(count); + if (ptr_host == nullptr) { + return -1; + } + int ret = xpu_memcpy(ptr, ptr_host, static_cast(count), + XPU_HOST_TO_DEVICE); + free(ptr_host); + return ret; +} + +void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { + DIPU_CALLKLX(_xpuMemset(ptr, val, size, stream)) +} + +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, + deviceId_t srcDevId, const void* src) { + if (dstDevId == srcDevId) { + DIPU_CALLKLX(xpu_memcpy(dst, src, nbytes, XPU_DEVICE_TO_DEVICE)) + } else { + DIPU_CALLKLX(xpu_memcpy_peer(dstDevId, dst, srcDevId, src, + static_cast(nbytes))) + } +} + +// (synchronous) copy from host to a DROPLET device +void memCopyH2D(size_t nbytes, void* dst, const void* src) { + DIPU_CALLKLX(xpu_memcpy(dst, src, nbytes, XPU_HOST_TO_DEVICE)) +} + +// (synchronous) copy from a DROPLET device to host +void memCopyD2H(size_t nbytes, void* dst, const void* src) { + DIPU_CALLKLX(xpu_memcpy(dst, src, nbytes, XPU_DEVICE_TO_HOST)) +} + +// (asynchronous) copy from device to a device +void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, + deviceId_t dstDevId, void* dst, deviceId_t srcDevId, + const void* src) { + memCopyD2D(nbytes, dstDevId, dst, srcDevId, src); +} + +// (asynchronous) copy from host to a device +void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void* dst, + const void* src) { + memCopyH2D(nbytes, dst, src); +} + +// (asynchronous) copy from a device to host +void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void* dst, + const void* src) { + memCopyD2H(nbytes, dst, src); +} + +} // end namespace devapis + +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/vendorapi.h b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/vendorapi.h new file mode 100644 index 0000000000..829e45b290 --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/vendor/kunlunxin/vendorapi.h @@ -0,0 +1,33 @@ +#pragma once +#include +#include + +#include + +#include + +namespace xdnn = baidu::xpu::api; +namespace dipu { + +#define DIPU_CALLKLX_ERROR(Expr) \ + { throw std::runtime_error(#Expr); } + +#define DIPU_CALLKLX(Expr) \ + { \ + int ret = (Expr); \ + TORCH_CHECK(ret == XPU_SUCCESS, "call ku error, expr = ", #Expr, \ + ", ret = ", ret); \ + } + +using deviceId_t = int; +using deviceStream_t = XPUStream; +#define deviceDefaultStreamLiteral nullptr +using deviceEvent_t = XPUEvent; +using deviceHandle_t = xdnn::Context*; + +class pcclComm_t {}; +using diclComm_t = pcclComm_t*; +class pcclUniqueId {}; +using commUniqueId = pcclUniqueId; + +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp index 0b84a9e8ab..9149e8e985 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp @@ -18,11 +18,18 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI { SUPACopyInplace() = default; ~SUPACopyInplace() = default; - // assume it can handle between device. 
- void copyNodirectBetweenDevices(at::Tensor& dst, const at::Tensor& src, - bool non_blocking, - CopyParamsInfo& info) override { - dipu_wrap_diopi_copy_inp(dst, src, non_blocking); + void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override { + auto curStream = dipu::getCurrentDIPUStream(); + ::diopiContext context(curStream.rawstream()); + auto ctx = &context; + auto diopi_src = dipu::diopi_helper::toDiopiTensorHandle(src); + auto diopi_dst = dipu::diopi_helper::toDiopiTensorHandle(dst); + TORCH_CHECK(diopiError_t::diopiSuccess == + diopiCopyInp(ctx, diopi_src, diopi_dst)); + // syncAfterCopy + if (!non_blocking) { + dipu::devapis::syncStream(curStream.rawstream()); + } } }; diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp index c04b74e79f..f2f2983869 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp @@ -184,6 +184,8 @@ DIPU_API void freeHost(void* p) { free(p); } extern "C" { void* br_device_malloc(uint64_t bytes); void br_device_free(void* ptr); +// get physical address from ptr(virtual) +void* get_phy_ptr(const void* ptr); } DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) { @@ -206,47 +208,60 @@ DIPU_API bool isPinnedPtr(const void* p) { return false; } // (asynchronous) set val DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { - SUPA_CALL(suMemsetAsync(ptr, val, size, stream)); + auto phy_gpu_addr = get_phy_ptr(ptr); + SUPA_CALL(suMemsetAsync(phy_gpu_addr, val, size, stream)); } // (synchronous) copy from device to a device DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { // SUPA uses Unified Virtual Address - SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToDevice)); + auto phy_src_gpu_addr = get_phy_ptr(src); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + SUPA_CALL(suMemcpy(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, + suMemcpyDeviceToDevice)); } // (synchronous) copy from host to a device DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src) { - SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyHostToDevice)); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + SUPA_CALL(suMemcpy(phy_dst_gpu_addr, src, nbytes, suMemcpyHostToDevice)); } // (synchronous) copy from a device to host DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src) { - SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToHost)); + auto phy_src_gpu_addr = get_phy_ptr(src); + SUPA_CALL(suMemcpy(dst, phy_src_gpu_addr, nbytes, suMemcpyDeviceToHost)); } // (asynchronous) copy from device to a device DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { - SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToDevice)); + auto phy_src_gpu_addr = get_phy_ptr(src); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, stream, + suMemcpyDeviceToDevice)); } // (asynchronous) copy from host to a device DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src) { - SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyHostToDevice)); + auto phy_dst_gpu_addr = get_phy_ptr(dst); + 
SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, src, nbytes, stream, + suMemcpyHostToDevice)); } // (asynchronous) copy from a device to host DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src) { - SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToHost)); + auto phy_src_gpu_addr = get_phy_ptr(src); + SUPA_CALL(suMemcpyAsync(dst, phy_src_gpu_addr, nbytes, stream, + suMemcpyDeviceToHost)); } } // end namespace devapis } // end namespace dipu diff --git a/dipu/torch_dipu/profiler/readme.md b/dipu/torch_dipu/profiler/README.md similarity index 75% rename from dipu/torch_dipu/profiler/readme.md rename to dipu/torch_dipu/profiler/README.md index 6a91f325c6..46f1e2fdb5 100644 --- a/dipu/torch_dipu/profiler/readme.md +++ b/dipu/torch_dipu/profiler/README.md @@ -1,10 +1,14 @@ # Profiler ## 简介 -DeepLink Profiler是一个允许在训练和推理过程中收集性能指标的工具。Profiler的上下文管理器API可用于了解哪些模型算子最耗时,并检查其输入形状和堆栈跟踪,研究设备kernel活动并可视化执行跟踪。当使用DeepLink进行模型训练时,可以使用DeepLink Profiler定位性能瓶颈,指导性能优化。 + +DeepLink Profiler 是一个允许在训练和推理过程中收集性能指标的工具。Profiler 的上下文管理器 API 可用于了解哪些模型算子最耗时,并检查其输入形状和堆栈跟踪,研究设备 kernel 活动并可视化执行跟踪。当使用 DeepLink 进行模型训练时,可以使用 DeepLink Profiler 定位性能瓶颈,指导性能优化。 + ## 使用说明 -本教程将以resnet18模型为例,讲解如何使用DeepLink Profiler分析模型性能。 -1. 导入必要的库 + +本教程将以 resnet18 模型为例,讲解如何使用 DeepLink Profiler 分析模型性能。 + +### 1. 导入必要的库 ``` python import torch_dipu @@ -13,22 +17,23 @@ import torchvision.models as models from torch.profiler import profile, record_function, ProfilerActivity ``` -2. 实例化resnet18模型 +### 2. 实例化 resnet18 模型 ```python model = models.resnet18() inputs = torch.randn(5, 3, 224, 224) ``` -3. 使用DeepLink profiler分析模型执行时间 +### 3. 使用 DeepLink Profiler 分析模型执行时间 + +DeepLink Profiler 接口对齐了 PyTorch Profiler,通过上下文管理器启用,并接受很多参数,常用的参数有 -DeepLink profiler接口对齐了PyTorch Profiler,通过上下文管理器启用,并接受很多参数,常用的参数有 + `activities`:要收集的打点列表 - * `ProfilerActivity.CPU`:收集PyTorch算子、TorchScript函数以及用户自定义代码标签 - * `ProfilerActivity.CUDA`:收集设备kernel打点 + + `ProfilerActivity.CPU`:收集 PyTorch 算子、TorchScript 函数以及用户自定义代码标签 + + `ProfilerActivity.CUDA`:收集设备 kernel 打点 + `record_shapes`:是否记录算子输入的形状 + `profile_memory`:是否统计模型张量内存消耗 -+ `use_cuda`:是否统计设备kernel执行时间 ++ `use_cuda`:是否统计设备 kernel 执行时间 + `with_stack`:是否打印调用栈 ```Python @@ -36,14 +41,16 @@ with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: with record_function("model_inference"): model(inputs) ``` + 打印出上面执行的统计数据: + ```Python print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)) ``` -输出如下 +输出如下: -``` +```text --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ @@ -60,16 +67,20 @@ print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)) --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 253.751ms ``` -从输出中可以发现,大部分的执行时间花在conv2d。 -需要说明的是,cpu time是指这个算子执行的总时间;同时,该算子有可能调用其他算子,self cpu time是该算子的总时间减去调用其他算子的时间。 +从输出中可以发现,大部分的执行时间花在 conv2d。 + +需要说明的是,cpu time 是指这个算子执行的总时间;同时,该算子有可能调用其他算子,self cpu time 是该算子的总时间减去调用其他算子的时间。 + +要获得更精细的结果粒度并包括运算符输入形状,需要设置 `group_by_input_shape=True`(注意:这需要将 profiler 的输入参数 `record_shape` 设置为 `True`) -要获得更精细的结果粒度并包括运算符输入形状,需要设置`group_by_input_shape=True`(注意:这需要将profile的输入参数`record_shape`设置为True) ```Python 
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10)) ``` -输出如下 -``` + +输出如下: + +```text --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- @@ -85,9 +96,11 @@ print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total aten::thnn_conv2d 0.01% 15.000us 14.36% 34.465ms 34.465ms 1 [[5, 3, 224, 224], [64, 3, 7, 7], [], [], [], []] --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- ``` -从输出可以看到,resnet18模型中卷积包含了几种不同的形状。 -Profiler还可用于分析在GPU和其他AI加速芯片上执行的模型的性能: +从输出可以看到,resnet18 模型中卷积包含了几种不同的形状。 + +Profiler 还可用于分析在 GPU 和其他 AI 加速芯片上执行的模型的性能: + ```Python model = models.resnet18().cuda() inputs = torch.randn(5, 3, 224, 224).cuda() @@ -100,9 +113,9 @@ with profile(activities=[ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20)) ``` -输出如下 +输出如下: -``` +```text ------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ @@ -130,11 +143,13 @@ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20)) Self CPU time total: 143.583ms Self CUDA time total: 168.781ms ``` -从输出可以看到,`diopiConvolution2d`和`diopiBatchNorm`是两个算子耗时最长。 -4. 分析内存消耗 +从输出可以看到,`diopiConvolution2d` 和 `diopiBatchNorm` 是两个算子耗时最长。 + +### 4. 分析内存消耗 + +PyTorch Profiler 还可以统计算子分配或释放的内存量。要启用内存分析功能,请将 `profile_memory` 设置成 `True`。 -PyTorch profiler还可以统计算子分配或释放的内存量。要启用内存分析功能,请将profile_memory设置成True。 ```Python model = models.resnet18() inputs = torch.randn(5, 3, 224, 224) @@ -143,8 +158,10 @@ with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shap print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10)) ``` -输出如下 -``` + +输出如下: + +```text --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg CPU Mem Self CPU Mem # of Calls --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ @@ -162,28 +179,27 @@ print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10)) Self CPU time total: 119.442ms ``` +### 5. 使用 Chrome trace viewer 进行可视化 -5. 
使用chrome trace viewer进行可视化 +Profiling 结果可以输出成 json 文件 -Profiling结果可以输出成json文件 ```Python model = models.resnet18().cuda() inputs = torch.randn(5, 3, 224, 224).cuda() with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: model(inputs) - + prof.export_chrome_trace("trace.json") ``` -使用Chrome trace viewer (chrome://tracing)工具查看trace.json文件,可视化结果如下图 +使用 Chrome trace viewer (`chrome://tracing`) 工具查看 `trace.json` 文件,可视化结果如下图: + +![trace](/dipu/img/profiler/trace_json.png) -
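除了在浏览器中查看,也可以直接用 Python 粗略检查导出的 trace 文件。下面是一个补充示意(假设导出文件名为 `trace.json` 且位于当前目录,并遵循 Chrome tracing 的 JSON 结构),仅用标准库完成,不依赖其他工具:

```Python
import json
from collections import Counter

# 粗略检查导出的 trace 文件(假设文件名为 trace.json,且位于当前目录)
with open("trace.json") as f:
    trace = json.load(f)

# Chrome tracing 格式的事件一般位于顶层的 "traceEvents" 列表中;
# 个别版本导出的可能直接是事件列表,这里做一下兼容
events = trace.get("traceEvents", []) if isinstance(trace, dict) else trace
print("事件总数:", len(events))

# 按事件名称统计出现次数,便于快速确认导出内容是否符合预期
print(Counter(e.get("name", "") for e in events).most_common(5))
```

若只需确认导出是否成功,检查事件列表是否非空即可;更细致的分析仍建议在 Chrome trace viewer 中进行。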
+### 6. 打印调用链 -6. 打印调用链 +Profiler 可用于分析 Python 和 TorchScript 堆栈跟踪。 -Profiler可用于分析Python和TorchScript堆栈跟踪。 ```Python with profile( activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], @@ -194,8 +210,10 @@ with profile( print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2)) ``` -输出如下 -``` + +输出如下: + +```text ------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ----------------------------------------------------------------- Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Source Location ------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ----------------------------------------------------------------- @@ -215,13 +233,16 @@ print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total" Self CPU time total: 139.666ms Self CUDA time total: 169.640ms ``` -7. 使用Profiler分析长时间运行任务 -Profiler提供了一个额外的API来处理长时间运行的作业(如模型训练)。跟踪所有的执行可能很慢,并导致非常大的跟踪文件。要避免这种情况,请使用可选参数: - 1. `schedule`:指定一个函数,该函数以整数参数作为输入,并返回一个动作给Profiler。使用这个参数的最佳方式是使用`torch.profiler.schedule`辅助函数,它可以为您生成一个schedule - 2. `on_trace_ready`:指定一个函数,该函数将Profiler的引用作为输入,并在每次准备好新跟踪时由Profiler调用。 +### 7. 使用 Profiler 分析长时间运行任务 + +Profiler 提供了一个额外的 API 来处理长时间运行的作业(如模型训练)。跟踪所有的执行可能很慢,并导致非常大的跟踪文件。要避免这种情况,请使用可选参数: + +1. `schedule`:指定一个函数,该函数以整数参数作为输入,并返回一个动作给 Profiler。使用这个参数的最佳方式是使用 `torch.profiler.schedule` 辅助函数,它可以为您生成一个 schedule。 +2. `on_trace_ready`:指定一个函数,该函数将 Profiler 的引用作为输入,并在每次准备好新跟踪时由 Profiler 调用。 + +为了说明 API 是如何工作的,让我们首先考虑以下带有 `torch.profiler.schedule` 函数的示例: -为了说明API是如何工作的,让我们首先考虑以下带有`torch.profiler.schedule`函数的示例: ```Python from torch.profiler import schedule @@ -232,22 +253,25 @@ my_schedule = schedule( active=3, repeat=2) ``` -Profiler假设长时间运行的任务由多个步骤组成,步骤编号从零开始。上面的示例定义了分析器的以下操作序列: -1. 参数`skip_first`告诉分析器在前10个步骤中忽略追踪(`skip_first`的默认值为零); -2. 在前`skip_first`个步骤之后,分析器开始执行分析器周期; + +Profiler 假设长时间运行的任务由多个步骤组成,步骤编号从零开始。上面的示例定义了分析器的以下操作序列: + +1. 参数 `skip_first` 告诉分析器在前 10 个步骤中忽略追踪(`skip_first` 的默认值为零); +2. 在前 `skip_first` 个步骤之后,分析器开始执行分析器周期; 3. 每个周期包括三个阶段: - 1. 空闲阶段(`wait=5`步骤),在此阶段分析器处于非活动状态; - 2. 预热阶段(`warmup=1`步骤),在此阶段分析器开始追踪,但结果会被丢弃。此阶段用于丢弃追踪开始时分析器获取的样本,因为它们通常会被额外的开销所影响; - 3. 活动追踪阶段(`active=3`步骤),在此阶段分析器进行追踪和记录数据; -4. 可选的repeat参数指定循环的上限。默认情况下(零值),分析器将在任务运行时执行循环。 + 1. 空闲阶段(`wait=5` 步骤),在此阶段分析器处于非活动状态; + 2. 预热阶段(`warmup=1` 步骤),在此阶段分析器开始追踪,但结果会被丢弃。此阶段用于丢弃追踪开始时分析器获取的样本,因为它们通常会被额外的开销所影响; + 3. 活动追踪阶段(`active=3` 步骤),在此阶段分析器进行追踪和记录数据; +4. 
可选的 repeat 参数指定循环的上限。默认情况下(零值),分析器将在任务运行时执行循环。 -因此,在上面的示例中,分析器将跳过前15个步骤,将下一个步骤用于预热,积极记录接下来的3个步骤,再跳过另外5个步骤,将下一个步骤用于预热,再积极记录另外3个步骤。由于指定了repeat=2参数值,分析器将在第一个两个周期后停止记录。 +因此,在上面的示例中,分析器将跳过前 15 个步骤,将下一个步骤用于预热,积极记录接下来的 3 个步骤,再跳过另外 5 个步骤,将下一个步骤用于预热,再积极记录另外 3 个步骤。由于指定了 `repeat=2` 参数值,分析器将在第一个两个周期后停止记录。 -在每个周期结束时,分析器调用指定的on_trace_ready函数,并将自身作为参数传递。该函数用于处理新的追踪结果,可以通过获取表格输出或将输出保存为追踪文件来进行处理。 +在每个周期结束时,分析器调用指定的 `on_trace_ready` 函数,并将自身作为参数传递。该函数用于处理新的追踪结果,可以通过获取表格输出或将输出保存为追踪文件来进行处理。 -要向分析器发送下一个步骤已开始的信号,请调用prof.step()函数。当前分析器步骤存储在prof.step_num中。 +要向分析器发送下一个步骤已开始的信号,请调用 `prof.step()` 函数。当前分析器步骤存储在 `prof.step_num` 中。 以下示例显示了如何使用上述概念: + ```Python def trace_handler(p): output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) @@ -266,27 +290,32 @@ with profile( model(inputs) p.step() ``` + ## 使用案例 -### 案例一 Mobilenet v2多卡训练性能分析与优化 -##### 1. 问题描述: +### 案例一 Mobilenet v2 多卡训练性能分析与优化 - 开发人员使用某个版本的DeepLink完成Mobilenet v2的适配后,发现该模型在NV上单机八卡训练很慢,需要进行性能优化,提升训练性能。 +#### 问题描述 -##### 2. 使用DeepLink Profer进行性能分析 - 1. 修改`mmpretrain`的`tools/train.py`,在`runner.train()`之前开启Profiler,将收集到的性能分析数据存入`mobilenetv2_profiler-slow` -```Python -from mmengine.hooks import ProfilerHook +开发人员使用某个版本的 DeepLink 完成 Mobilenet v2 的适配后,发现该模型在 NV 上单机八卡训练很慢,需要进行性能优化,提升训练性能。 -profiler_hook = ProfilerHook(by_epoch = False, profile_times=10, activity_with_cpu=True, activity_with_cuda=True, json_trace_path='mobilenetv2_profiler-slow') -runner.register_custom_hooks([profiler_hook]) -``` - 2. 使用chrome trace viewer查看,发现conv2d耗时长,从图中可以看到conv2d调用到了`thnn_conv2d`,而不是预期的`cudnn_convolution` -
+#### 使用 DeepLink Profer 进行性能分析 + +1. 修改 `mmpretrain` 的 `tools/train.py`,在 `runner.train()` 之前开启 Profiler,将收集到的性能分析数据存入 `mobilenetv2_profiler-slow`: + + ```Python + from mmengine.hooks import ProfilerHook + + profiler_hook = ProfilerHook(by_epoch = False, profile_times=10, activity_with_cpu=True, activity_with_cuda=True, json_trace_path='mobilenetv2_profiler-slow') + runner.register_custom_hooks([profiler_hook]) + ``` + +2. 使用 chrome trace viewer 查看,发现 conv2d 耗时长,从图中可以看到 conv2d 调用到了`thnn_conv2d`,而不是预期的`cudnn_convolution`。 + + ![sample-conv2d](/dipu/img/profiler/thnn_conv2d.png) + +3. 最后定位到 DeepLink 某个版本新增了 `torch._C._set_cudnn_enabled(false)`,关闭了 cudnn,把这句话删除速度恢复正常。 - 3. 最后定位到DeepLink某个版本新增了 `torch._C._set_cudnn_enabled(false)`,关闭了cudnn,把这句话删除速度恢复正常。 +## 参考资料 -## 参考 -1. [PyTorch profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) ++ [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) diff --git a/dipu/torch_dipu/profiler/profiler.py b/dipu/torch_dipu/profiler/profiler.py index 0011f9ac9c..61313844cd 100644 --- a/dipu/torch_dipu/profiler/profiler.py +++ b/dipu/torch_dipu/profiler/profiler.py @@ -412,6 +412,12 @@ def trim_path(path, src_column_width): def apply_profiler_patch(): + # The data collected by dipu profiler differs significantly from pytorch profiler, + # making it difficult to align during performance analysis. + # Reuse pytorch profiler logic on NV, while providing environment variables to switch to dipu profiler. + if _C.dipu_vendor == 'CUDA' and os.environ.get("FORCE_USE_DIPU_PROFILER", 'False').lower() == 'false' : + return + setattr(torch.profiler.profiler, 'kineto_available', dipu_kineto_available) setattr(torch.autograd.profiler, 'kineto_available', dipu_kineto_available) setattr(torch.autograd.profiler, '_prepare_profiler', _C._prepare_profiler)
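For reference, a minimal usage sketch of the new `FORCE_USE_DIPU_PROFILER` switch introduced above, on a CUDA machine. This assumes `apply_profiler_patch()` runs when `torch_dipu` is imported; per the check in the patch, any value other than `false` (case-insensitive) keeps the dipu profiler patch active:

```Python
import os

# Must be set before importing torch_dipu (assumption: apply_profiler_patch()
# is invoked during import). Any value other than "false" skips the early
# return added by this patch, so the dipu profiler is used even on CUDA.
os.environ["FORCE_USE_DIPU_PROFILER"] = "True"

import torch_dipu  # noqa: F401
```

Leaving the variable unset (or set to `false`) falls back to the stock PyTorch profiler on CUDA, which is the new default behavior introduced by this patch.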