From cb81f879d9a882a351ba3c0027e091f12df9d365 Mon Sep 17 00:00:00 2001
From: POI-WX <131418410+POI-WX@users.noreply.github.com>
Date: Wed, 6 Dec 2023 18:25:42 +0800
Subject: [PATCH 01/58] [DIPU] Wx/modify maximum schema due to the case in the
inference of internlm (#494)
* improve maximum schema due to the case in the inference of internlm
* fix bug according to comments
* fix bug
---
.../diopi_functions.yaml | 15 +++++--
.../python/unittests/test_minimum_maximum.py | 40 +++++++++++++++++++
2 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
index 8812397c5a..4b58185a24 100755
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -1134,8 +1134,12 @@
interface: diopiMaxAll(ctx, out, self)
- schema: "maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
- no_device_check_args: [other]
- interface: diopiMaximum(ctx, out, self, other)
+ no_device_check_args: [self, other]
+ ins: [selfTemp, otherTemp]
+ custom_code_at_the_beginning: |
+ auto selfTemp = (self.numel() == 1 && self.is_cpu()) ? self.to(other.device()) : self;
+ auto otherTemp = (other.numel() == 1 && other.is_cpu()) ? other.to(self.device()) : other;
+ interface: diopiMaximum(ctx, out, selfTemp, otherTemp)
- schema: "max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_indices) -> (Tensor(a!) max, Tensor(b!) max_indices)"
custom_code_at_the_beginning: |
@@ -1679,7 +1683,12 @@
interface: diopiClampMaxInp(ctx, self, max)
- schema: "minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
- interface: diopiMinimum(ctx,out, self, other)
+ no_device_check_args: [self, other]
+ ins: [selfTemp, otherTemp]
+ custom_code_at_the_beginning: |
+ auto selfTemp = (self.numel() == 1 && self.is_cpu()) ? self.to(other.device()) : self;
+ auto otherTemp = (other.numel() == 1 && other.is_cpu()) ? other.to(self.device()) : other;
+ interface: diopiMinimum(ctx, out, selfTemp, otherTemp)
- schema: "scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)"
interface: diopiScatterScalar(ctx, out, self, dim, value, index, "")
diff --git a/dipu/tests/python/unittests/test_minimum_maximum.py b/dipu/tests/python/unittests/test_minimum_maximum.py
index eecc57bc18..a6b00383d4 100644
--- a/dipu/tests/python/unittests/test_minimum_maximum.py
+++ b/dipu/tests/python/unittests/test_minimum_maximum.py
@@ -15,6 +15,26 @@ def test_minimum(self):
r_cpu = torch.minimum(a.to(self.cpu), b.to(self.cpu))
self.assertEqual(r_dipu.to(self.cpu), r_cpu)
+ def test_minimum_scalar(self):
+ # special test cases from the inference of internlm
+ a = torch.randn((3, 4))
+ b = torch.tensor(torch.finfo(a.dtype).max)
+ # scalar on cpu
+ r_dipu1 = torch.minimum(a.to(self.dipu), b)
+ # scalar on device
+ r_dipu2 = torch.minimum(a.to(self.dipu), b.to(self.dipu))
+ r_cpu = torch.minimum(a, b)
+ self.assertEqual(r_dipu1.to(self.cpu), r_cpu)
+ self.assertEqual(r_dipu2.to(self.cpu), r_cpu)
+
+ def test_minimum_different_devices(self):
+ a = torch.tensor([1, -2, 3])
+ b = torch.tensor([4, 0, 2]).to(self.dipu)
+ with self.assertRaises(RuntimeError) as context:
+ torch.minimum(a, b)
+ self.assertIn(
+ 'Expected all tensors to be on the same device', str(context.exception))
+
def test_maximum(self):
a = torch.tensor((1, 2, -1))
b = torch.tensor((3, 0, 4))
@@ -22,6 +42,26 @@ def test_maximum(self):
r_cpu = torch.maximum(a.to(self.cpu), b.to(self.cpu))
self.assertEqual(r_dipu.to(self.cpu), r_cpu)
+ def test_maximum_scalar(self):
+ # special test cases from the inference of internlm
+ a = torch.randn((3, 4))
+ b = torch.tensor(torch.finfo(a.dtype).min)
+ # scalar on cpu
+ r_dipu1 = torch.maximum(a.to(self.dipu), b)
+ # scalar on device
+ r_dipu2 = torch.maximum(a.to(self.dipu), b.to(self.dipu))
+ r_cpu = torch.maximum(a, b)
+ self.assertEqual(r_dipu1.to(self.cpu), r_cpu)
+ self.assertEqual(r_dipu2.to(self.cpu), r_cpu)
+
+ def test_maximum_different_devices(self):
+ a = torch.tensor([1, -2, 3])
+ b = torch.tensor([4, 0, 2]).to(self.dipu)
+ with self.assertRaises(RuntimeError) as context:
+ torch.maximum(a, b)
+ self.assertIn(
+ 'Expected all tensors to be on the same device', str(context.exception))
+
if __name__ == "__main__":
run_tests()
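For reference, the behaviour this patch enables and tests can be sketched with stock PyTorch calls; `"cuda"` stands in here for the DIPU device exposed through the CUDA mock (an illustrative assumption, not part of the patch):

```python
import torch

a = torch.randn(3, 4, device="cuda")        # tensor on the accelerator
b = torch.tensor(torch.finfo(a.dtype).max)  # one-element tensor on the CPU

# Allowed: the generated wrapper moves the one-element CPU operand onto the
# device before calling diopiMinimum / diopiMaximum.
r1 = torch.minimum(a, b)
r2 = torch.minimum(a, b.to(a.device))
assert torch.equal(r1.cpu(), r2.cpu())

# Still rejected: multi-element tensors on different devices must raise.
try:
    torch.maximum(torch.tensor([1, -2, 3]), torch.tensor([4, 0, 2], device="cuda"))
except RuntimeError as e:
    assert "Expected all tensors to be on the same device" in str(e)
```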
From d6c0094275afccf0147228926a53ba782519ca59 Mon Sep 17 00:00:00 2001
From: wiryls <7984500+wiryls@users.noreply.github.com>
Date: Wed, 6 Dec 2023 19:25:30 +0800
Subject: [PATCH 02/58] [both] fix, format and remove spaces in README.md
(#497)
* doc(readme): fix, format and remove spaces
* fix: typo and try auto-correct
* feat(ci): add autocorrect into ci
* fix: remove autocorrect from ci as it's not ready
---
.github/workflows/format.yml | 15 +-
README.md | 2 +-
dicp/README.md | 85 +++++++++
dicp/readme.md | 85 ---------
dipu/Contributors.md | 14 +-
dipu/QuickStart.md | 20 +--
dipu/README.md | 24 +--
.../profiler/{readme.md => README.md} | 165 ++++++++++--------
8 files changed, 220 insertions(+), 190 deletions(-)
create mode 100644 dicp/README.md
delete mode 100644 dicp/readme.md
rename dipu/torch_dipu/profiler/{readme.md => README.md} (75%)
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index cbad72ae4a..32efe64a4e 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -11,19 +11,22 @@ jobs:
markdownlint:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - name: Checkout code
+ uses: actions/checkout@v4
with:
- fetch-depth: 2
- - uses: tj-actions/changed-files@v40
+ fetch-depth: 8
+ - name: Collect changed files
+ uses: tj-actions/changed-files@v40
id: changed-files
with:
files: '**/*.md'
- separator: ","
- - uses: DavidAnson/markdownlint-cli2-action@v14
+ separator: ','
+ - name: MarkdownLint
if: steps.changed-files.outputs.any_changed == 'true'
+ uses: DavidAnson/markdownlint-cli2-action@v14
with:
globs: ${{ steps.changed-files.outputs.all_changed_files }}
- separator: ","
+ separator: ','
clang-format:
needs: markdownlint
diff --git a/README.md b/README.md
index a3a17decde..7dd7aafe2f 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Deeplink.framework 是 DeepLink 推出的介于 AI 训练框架和硬件语言
### DIPU
-DIPU (Device Independent Process Unit) 是由一组抽象设备 runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层共同组成的拓展包。 用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
+DIPU (Device Independent Process Unit) 是由一组抽象设备 runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层共同组成的拓展包。用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被映射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 既可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
### DICP
diff --git a/dicp/README.md b/dicp/README.md
new file mode 100644
index 0000000000..db01a09b6c
--- /dev/null
+++ b/dicp/README.md
@@ -0,0 +1,85 @@
+
+
+
+
+
+
+# DICP
+
+标准编译协议(Device-Independent Compile Protocol, DICP)定义了统一的计算描述(中间表示),通过计算图获取深度学习模型中的计算任务表达为上述中间表示,然后通过计算图优化技术自动生成人工智能芯片设备代码,从而提高研发效率和计算的执行性能。中间表示是介于源语言和目标语言之间的程序表示,能够极大程度地提高编译流程的可拓展性,同时也能降低优化流程对前端和后端的破坏。多层次中间表示包含从应用到芯片端的多种表示层次,不同层次旨在解决不同尺度的问题。
+
+DICP 主要的核心功能如下:
+
+1. 通过接入编译路线带来性能优势,在大模型场景最大限度释放芯片能力。
+2. 作为训练框架与国产硬件芯片之间的通用桥梁,支持多种前后端,带来使用易用性。
+3. 提供易用、高效的一站式编译适配流程,灵活支持国产硬件图编译器的特性,提高芯片适配效率。
+
+下图描述了 DICP 在编译链路中的位置:
+
+
+
+1. 训练框架通过图获取模块将用户的模型代码转换成统一的中间表达。此处的中间表达完全与芯片无关。所以在之后的编译协议部分中,需要建立起与后端芯片的联系。这样才能高效的完成接入。
+2. 编译协议完成了衔接框架与芯片编译器的工作,其中包含硬件相关的切图,统一中间表达与芯片所支持的算子之间的映射关系以及数据格式的转换模块。
+3. 在编译协议吸收了芯片特点之后,由代码生成模块生成最终的代码,并通过芯片的编译器生成二进制可执行文件之后由框架调用。
+
+## 基于 DICP 的国产硬件接入 PyTorch 2 实践
+
+
+
+基于上述 DICP,国产硬件可快速接入 PyTorch 2 的编译路线。此路线中的 TorchDynamo 组件,可使国产硬件在运行时的 overhead 大幅缩小。
+并且针对国产硬件实现了以下特性:
+
+- 灵活支持国产硬件图编译器的特性
+- 支持多种国产硬件数据格式
+- 支持动态 shape
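As an illustration of this entry point, a minimal sketch of driving a model through the DICP compile route; it assumes the vendor graph compilers are registered as TorchDynamo backends under the names the test scripts pass (`topsgraph` / `ascendgraph`), which is stated here as an assumption rather than a documented API:

```python
import torch

model = torch.nn.Linear(16, 16)  # any nn.Module; a toy example

# "topsgraph" is assumed to be the backend name dicp registers with
# TorchDynamo (the same name run_test_model.sh uses); "ascendgraph" for Ascend.
compiled = torch.compile(model, backend="topsgraph")

# The first call triggers: FX graph capture -> vendor IR conversion and
# optimisation -> code generation -> execution on the device.
out = compiled(torch.randn(4, 16))
```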
+
+### 运行逻辑
+
+DICP 的运行逻辑如下图所示:
+
+
+
+
+其中:
+
+1. **算子映射**:主要解决框架层算子与后端图编译器的算子之间的语义差别,包括 1 对 1 和 1 对多的转换。
+2. **Shape & Dtype 推导**:进行 Shape & data_type 的推导,补全整张静态图上的信息,便于之后在代码生成模块能生成代码。
+3. **子图改写**:将多个小算子融合成为一个或多个适合图编译器的算子,配合后端图编译器将计算效率最大化。
+4. **数据格式调整**:是根据后端芯片与其图编译器的特性,针对特定的算子调整其输入输出的数据格式,使得最大程度的发挥芯片性能。
+
+### 目录结构
+
+- `dicp/dynamo_bridge`:多后端通用的接入代码,包含了
+ 1. 接收从 AOTAutograd 下发而来的 FX Graph
+ 2. 启动各个厂商的 IR 转换与优化
+ 3. 启动 CodeGen 以及 JIT 缓存的逻辑。
+- `dicp/vender`: 主要包含了各个厂商 IR 的定义,AtenIR 到厂商 IR 的转换,厂商 IR 上的优化以及最后的代码生成模块。
+- `test`: 包含了 model 测试与 op 测试
+
+### Demo
+
+#### 安装 DICP
+
+```bash
+cd /path_to_dicp
+pip install .
+```
+
+#### 在华为 910 上执行 llama7B 前向推理
+
+```bash
+export DIPU_MOCK_CUDA=false
+export DICP_TOPS_DIPU=True
+export TEST_DIR=/path_to_dicp/test/
+export LLAMA_MODEL_DIR=/path_to_llama_model
+bash /path_to_dicp/test/model/run_test_model.sh llama ascendgraph false
+```
+
+#### 在燧原 T20 上执行 resnet50 训练
+
+```bash
+export DIPU_MOCK_CUDA=false
+export DICP_TOPS_DIPU=True
+export TEST_DIR=/path_to_dicp/test/
+bash /path_to_dicp/test/model/run_test_model.sh resnet50 topsgraph false
+```
diff --git a/dicp/readme.md b/dicp/readme.md
deleted file mode 100644
index 6a5fc8de06..0000000000
--- a/dicp/readme.md
+++ /dev/null
@@ -1,85 +0,0 @@
-
-
-
-
-# DICP
-
-标准编译协议(Device-Independent Compile Protocol,DICP)定义了统一的计算描述(中间表示),通过计算图获取深度学习模型中的计算任务表达为上述中间表示,然后通过计算图优化技术自动生成人工智能芯片设备代码,从而提高研发效率和计算的执行性能。中间表示是介于源语言和目标语言之间的程序表示,能够极大程度地提高编译流程的可拓展性,同时也能降低优化流程对前端和后端的破坏。多层次中间表示包含从应用到芯片端的多种表示层次,不同层次旨在解决不同尺度的问题。
-
-DICP主要的核心功能如下:
-1. **通过接入编译路线带来性能优势,在大模型场景最大限度释放芯片能力**
-2. **作为训练框架与国产硬件芯片之间的通用桥梁,支持多种前后端,带来使用易用性**
-3. **提供易用、高效的一站式编译适配流程,灵活支持国产硬件图编译器的特性,提高芯片适配效率**
-
-下图描述了DICP在编译链路中的位置:
-
-
-
-
-*DICP在编译链路中的位置
-
-
-
-1. 训练框架通过图获取模块将用户的模型代码转换成统一的中间表达。此处的中间表达完全与芯片无关。所以在之后的编译协议部分中,需要建立起与后端芯片的联系。这样才能高效的完成接入。
-2. 编译协议完成了衔接框架与芯片编译器的工作,其中包含硬件相关的切图,统一中间表达与芯片所支持的算子之间的映射关系以及数据格式的转换模块。
-3. 在编译协议吸收了芯片特点之后,由代码生成模块生成最终的代码,并通过芯片的编译器生成二进制可执行文件之后由框架调用。
-
-
-
-## 基于DICP的国产硬件接入PyTorch2实践
-
-
-
-基于上述DICP,国产硬件可快速接入Pytorch2的编译路线。此路线中的TorchDynamo组件,可使国产硬件在运行时的overhead大幅缩小。
-并且针对国产硬件实现了以下特性:
- - 灵活支持国产硬件图编译器的特性
- - 支持多种国产硬件数据格式
- - 支持动态shape
-
-### 运行逻辑
-DICP的运行逻辑如下图所示:
-
-
-
-
-
-
-其中:
-1. **算子映射**: 主要解决框架层算子与后端图编译器的算子之间的语义差别,包括1对1和1对多的转换。
-2. **Shape&Dtype推导**: 进行Shape&data_type的推导,补全整张静态图上的信息,便于之后在代码生成模块能生成代码。
-3. **子图改写**: 将多个小算子融合成为一个或多个适合图编译器的算子,配合后端图编译器将计算效率最大化。
-4. **数据格式调整**: 是根据后端芯片与其图编译器的特性,针对特定的算子调整其输入输出的数据格式,使得最大程度的发挥芯片性能。
-
-### 目录结构
-* dicp/dynamo_bridge: 多后端通用的接入代码,包含了
- 1. 接收从AOTAutograd下发而来的FX Graph
- 2. 启动各个厂商的IR转换与优化
- 3. 启动CodeGen以及JIT缓存的逻辑。
-* dicp/vender: 主要包含了各个厂商IR的定义,AtenIR到厂商IR的转换,厂商IR上的优化以及最后的代码生成模块。
-* test: 包含了model测试与op测试
-
-
-### Demo
-
-#### 安装DICP
-
-```
-cd /path_to_dicp
-pip install .
-```
-
-#### 在华为910上执行llama7B前向推理
-```
-export DIPU_MOCK_CUDA = false
-export DICP_TOPS_DIPU = True
-export TEST_DIR = /path_to_dicp/test/
-export LLAMA_MODEL_DIR=/path_to_llama_model
-bash /path_to_dicp/test/model/run_test_model.sh llama ascendgraph false
-```
-
-#### 在燧原T20上执行resnet50训练
-```
-export DIPU_MOCK_CUDA = false
-export DICP_TOPS_DIPU = True
-export TEST_DIR = /path_to_dicp/test/
-bash /path_to_dicp/test/model/run_test_model.sh resnet50 topsgraph false
-```
diff --git a/dipu/Contributors.md b/dipu/Contributors.md
index bbfd7ae213..e612cf0bdd 100644
--- a/dipu/Contributors.md
+++ b/dipu/Contributors.md
@@ -18,7 +18,7 @@
### 拉取请求工作流
-如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考[GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)
+如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考 [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)
#### 复刻仓库
@@ -43,7 +43,7 @@ upstream git@github.com:DeepLink-org/deeplink.framework (fetch)
upstream git@github.com:DeepLink-org/deeplink.framework (push)
```
-> 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 `git clone` 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 dipu 。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。
+> 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 `git clone` 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 dipu。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。
#### 创建开发分支
@@ -59,7 +59,7 @@ git checkout -b xxx/refactor_contributing_doc
git pull upstream main
```
-#### 提交代码并在本地通过dipu测试
+#### 提交代码并在本地通过 DIPU 测试
提交的代码需要通过 DIPU 在各设备上的测例和模型 one_iter 测试。
@@ -78,11 +78,11 @@ git push -u origin {branch_name}
1. 在 GitHub 的 pull request 界面创建拉取请求
2. 根据指引修改 pull request 描述,以便于其他开发者更好地理解你的修改
-描述规范详见[拉取请求规范](#拉取请求规范)
+描述规范详见 [拉取请求规范](#拉取请求规范)
注意事项:
-- Pull request 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 issue(具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。
+- Pull request 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 issue(具体方式见 [GitHub 官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。
- 如果是第一次为 DIPU 做贡献,需要签署 CLA。
- 检查提交的 pull request 是否通过 CI(持续集成)。
- 如果 pull request 通过了 CI 检查,那么就可以等待其他开发者的 review,并根据 reviewer 的意见,修改代码,并重复上述步骤,直到 reviewer 同意合入 pull request。
@@ -117,7 +117,7 @@ git merge upstream/main
- 每次 commit 时需要提供清晰且有意义 commit 信息。
- 提供清晰且有意义的 pull request 描述:
- 标题写明白任务名称,参考格式:`[Prefix] Short description of the pull request (Suffix)`;
- - Prefix 参考:新增功能 `[Feature]`, 修 bug `[Fix]`, 文档相关 `[Docs]`, 开发中 `[WIP]` (暂时不会被 review)。
- - 描述里介绍 pull request 的主要修改内容,结果,以及对其他部分的影响, 参考 pull request 模板;
+ - Prefix 参考:新增功能 `[Feature]`, 修 bug `[Fix]`, 文档相关 `[Docs]`, 开发中 `[WIP]` (暂时不会被 review)。
+ - 描述里介绍 pull request 的主要修改内容,结果,以及对其他部分的影响,参考 pull request 模板;
- 关联相关的 issue 和其他 pull request。
- 如果引入了其他三方库,或借鉴了三方库的代码,请确认它们的许可证和 DIPU License 兼容,并在借鉴的代码上补充 `This code is inspired from `。
diff --git a/dipu/QuickStart.md b/dipu/QuickStart.md
index 10ccf63796..b5f640a2ad 100644
--- a/dipu/QuickStart.md
+++ b/dipu/QuickStart.md
@@ -167,7 +167,7 @@ export DIPU_FORCE_FALLBACK_OPS_LIST=add.out,conv2d
python -c "import torch_dipu"
```
-Fallback scalar 版本的重载函数, tensor 版本的重载函数类似:
+Fallback scalar 版本的重载函数,tensor 版本的重载函数类似:
```bash
export DIPU_FORCE_FALLBACK_OPS_LIST='.*.Scalar'
@@ -203,7 +203,7 @@ add_custom_command(
以上方法是对所有算子开启自动精度对比。如果只需要对特定算子做精度对比,也可只给需要的算子做精度对比,只需要在相关的配置文件(如 `dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml`)给相应的算子添加 `autocompare: True` 即可。
```shell
-$ unset DIPU_FORCE_FALLBACK_OPS_LIST # 主要是确保要比较的算子没有强制fallback到cpu,可选
+$ unset DIPU_FORCE_FALLBACK_OPS_LIST # 主要是确保要比较的算子没有强制 fallback 到 cpu, 可选
$ python
>>> import torch
>>> import torch_dipu
@@ -229,7 +229,7 @@ autocompare: add.out other: allclose
>>>
```
-可以看到,CPU 计算结果与设备计算结果 `allclose`,也能看到CPU和设备计算结果的 `shape`、`dtype` 等信息。特别的,需要注意以下几个问题:
+可以看到,CPU 计算结果与设备计算结果 `allclose`,也能看到 CPU 和设备计算结果的 `shape`、`dtype` 等信息。特别的,需要注意以下几个问题:
1. `dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml` 中配置了 `autograd:True` 的算子 (`cross_entropy_loss`、`conv2d`、`dropout`、`dropout_`、`linear`) 暂不支持 *backward* 的精度自动对比。如模型精度对不齐,可根据需要先将这几个算子 fallback 到 CPU 来确定问题。
2. 随机数生成相关的算子(`dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml` 中配置了 `autocompare:False`)没有做 `autocompare`,因为结果总是 `not_allclose`。
@@ -245,12 +245,11 @@ autocompare: add.out other: allclose
>>> import os
diopi dyload init
>>> x = torch.randn(3,4).cuda()
->>> os.environ['DIPU_DUMP_OP_ARGS']='1' # 只打印调用的底层算子名以及相关的diopi函数
+>>> os.environ['DIPU_DUMP_OP_ARGS']='1' # 只打印调用的底层算子名以及相关的 diopi 函数
>>> y = x + x
[dipu_add_out:349]:add.out diopiAdd
-
->>> os.environ['DIPU_DUMP_OP_ARGS']='2' # 打印调用的底层算子名,相关的diopi函数,算子参数
+>>> os.environ['DIPU_DUMP_OP_ARGS']='2' # 打印调用的底层算子名,相关的 diopi 函数,算子参数
>>> y = x + 3
[dipu_add_out:349]:add.out diopiAdd
[dipu_add_scalar_out:248]:add.Scalar_out diopiAddScalar
@@ -259,8 +258,7 @@ diopi dyload init
add.Scalar_out: alpha:1
add.Scalar_out: out:numel:12, sizes:[3, 4], stride:[4, 1], is_view:0, TensorOptions(dtype=float, device=privateuseone:0, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)), data_ptr:0x7ff8c8c00400
-
->>> os.environ['DIPU_DUMP_OP_ARGS']='3' # 打印调用的底层算子名,相关的diopi函数,算子参数, tensor的值
+>>> os.environ['DIPU_DUMP_OP_ARGS']='3' # 打印调用的底层算子名,相关的 diopi 函数,算子参数, tensor 的值
>>> y = x * 3
[dipu_mul_out:815]:mul.out diopiMul
[dipu_mul_scalar_out:753]:mul.Scalar_out diopiMulScalar
@@ -289,7 +287,7 @@ diopi dyload init
### 核心代码添加
-- 在 `dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h` 中定义了DIPU支持的硬件类型,我们需要在 `VendorDeviceType` 枚举类中添加 `DROPLET` 的硬件后端,并在这个文件中的`VendorTypeToStr` 函数里添加新硬件支持。后续这个文件中可能有更多的函数会涉及到硬件类型,按需添加即可。
+- 在 `dipu/torch_dipu/csrc_dipu/runtime/device/basedef.h` 中定义了 DIPU 支持的硬件类型,我们需要在 `VendorDeviceType` 枚举类中添加 `DROPLET` 的硬件后端,并在这个文件中的`VendorTypeToStr` 函数里添加新硬件支持。后续这个文件中可能有更多的函数会涉及到硬件类型,按需添加即可。
- `dipu/torch_dipu/csrc_dipu/vendor` 文件夹中存有各个硬件后端的 *runtime* 接入代码,我们需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/deviceapis.h` 中的声明,创建 `deviceimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。下面是 `deviceapis.h` 中的 `createStream` 函数的在国产硬件上的实现样例:
``` cpp
@@ -302,7 +300,7 @@ void createStream(deviceStream_t* stream, bool prior) {
}
```
-- 如果有多机多卡训练的需求,需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h` 中的声明,创建 `communiatorimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。
+- 如果有多机多卡训练的需求,需要根据 `dipu/torch_dipu/csrc_dipu/runtime/device/diclapis.h` 中的声明,创建 `communicatorimpl.cpp` 去根据硬件自己底层的 *runtime* 接口实现对应的函数。
- DIPU 在 `dipu/torch_dipu/csrc_dipu/runtime/core/DIPUGeneratorImpl.h` 中声明了 `DIPUGeneratorImpl` 这一个基本类型,如果我们的硬件实现了自己的 `generator` 基础函数,可以在这基础上实现自己的 `DeviceGeneratorImpl`,并实现基础的 `generator` 相关函数。国产硬件暂无这方面的实现。
### 增加编译脚本
@@ -326,4 +324,4 @@ void createStream(deviceStream_t* stream, bool prior) {
- 根据 DIPU 的编译介绍,我们在编译了 DIPU 之后,需要注意将 `LIBRARY_PATH`、`LD_LIBRARY_PATH`、`PYTHONPATH` 都设置好避免后续使用出现问题。
- `dipu/tests` 文件夹中有许多基础功能的测试,建议首先尝试测试 `python -u dipu/tests/python/unittests/test_add.py`,该文件测试跑通基本意味着我们的设备 *runtime* 接入没有问题了。
-- 编译脚本参考[编译 DIPU](#编译-dipu),测试脚本可以参考[验证 DIPU](#验证-dipu)。
+- 编译脚本参考 [编译 DIPU](#编译-dipu),测试脚本可以参考 [验证 DIPU](#验证-dipu)。
diff --git a/dipu/README.md b/dipu/README.md
index 3b55bac80d..ce128bcf4c 100644
--- a/dipu/README.md
+++ b/dipu/README.md
@@ -8,7 +8,7 @@
## 介绍
-DIPU (device independent process unit) 是由 **一组抽象设备 Runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层** 共同组成的拓展包。 用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被影射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 即可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
+DIPU (device independent process unit) 是由 **一组抽象设备 Runtime 接口,一组框架能力相关的运行时基类/接口,一个针对 DIOPI 标准算子的适配层** 共同组成的拓展包。用来在训练框架 PyTorch 上接入 DIOPI 算子库,实现 Eager 模式的推理和训练。其能够在编译时,决定抽象设备被映射的方式;并使用统一的运行时,减少在多硬件上适配训练框架的成本。DIPU 既可以基于统一的设备运行时来屏蔽厂商的实际设备;也可以基于统一的框架相关的运行时基类,由厂商自行实现特有的运行时逻辑。
虽然 PyTorch 定义了一套基础的运行时接口 `c10`,可以基于这个接口直接抽象各个设备接口,但是 `c10` 首先是个直面框架层的接口,每个接入的设备都需要实现大量类似的逻辑来完成 `c10` 的实现,对于多设备的支持很不方便。DIPU 先把 `c10` 的运行时适配到 DIPU 自己的运行时,把通用的逻辑抽取出来,可以让厂商仅实现必要的设备接口即可工作。
@@ -25,7 +25,7 @@ DIPU 结构上分为 Python 和 CPP 两部分:
Runtime 主要有以下几个部分:
1. *Core & Distributed*
- - PyTorch 把一些基本的设备层接口放到了一个叫 `c10` 的目录下,不同的设备接入者需要实现该接口来接入 PyTorch。详见[参考文档](http://blog.ezyang.com/2019/05/pytorch-internals/)对于`c10` 的介绍。
+ - PyTorch 把一些基本的设备层接口放到了一个叫 `c10` 的目录下,不同的设备接入者需要实现该接口来接入 PyTorch。详见 [参考文档](http://blog.ezyang.com/2019/05/pytorch-internals/) 对于`c10` 的介绍。
- DIPU 的这一部分主要就是对 PyTorch 的 `c10` 和 `c10d` 相关接口的实现,把设备无关的部分抽象出一组运行时基类。目前包含 `DIPUAllocator`、`DIPUGenerator`、`DIPUStream/Event/Guard`、`ProcessGroupDICL` 等。这些类会把设备相关的请求代理到 *device* 部分定义的一组设备接口。另外用户也可以继承上述基类,实现并注册自己的子类,实现设备特化的某些行为(这个能力的支持目前尚待完善)。
2. *Device*
- 包含 `deviceapis.h` 和 `diclapis.h` 两个接口文件。主要是设备 `memory/stream/event/communcation` 相关的接口函数(这部分接口后续有考虑挪到 DIOPI 中,成为 DIOPI 的 *Device* 接口,见上图)。
@@ -40,7 +40,7 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能
#### DiopiRT (`csrc/dipu/diopirt`)
-用于实现 DIOPI 要求的 *Runtime*,具体参考 [DIOPI项目](https://github.com/DeepLink-org/DIOPI)。
+用于实现 DIOPI 要求的 *Runtime*,具体参考 [DIOPI 项目](https://github.com/DeepLink-org/DIOPI)。
#### Binding to Python (`csrc/dipu/binding`)
@@ -52,10 +52,10 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能
一般的,除了要实现上面 *Device* 部分要求的接口函数外,*Vendor* 还需要实现一个特殊的 `vendorapi.h`,在这里导出设备 `device/stream/event/comm` 相关的数据结构定义。未来计划在设备层允许 *Vendor* 注册特化的 *Runtime* 子类,或者实现子类的构建器/工厂方法接口,实现设备特化的 *Runtime* 行为。
-### Python层
+### Python 层
1. DIPU 设备层接口 (`torch_dipu/dipu`):
- - 包含CPP层的 *Runtime* 接口对应的 Python 层。这部分会导出部分函数给用户侧,导出的函数类比 PyTorch 的 `torch/cuda` 部分。
+ - 包含 CPP 层的 *Runtime* 接口对应的 Python 层。这部分会导出部分函数给用户侧,导出的函数类比 PyTorch 的 `torch/cuda` 部分。
2. DIPU 采用 `monkey-patch` 的方式模拟了部分 PyTorch tensor 接口,让它们可以处理 DIPU 特殊的参数,该部分的设计还在优化中。
3. DIPU 拥有一定的模拟 CUDA 接口的能力。简单来说就是在 Python 层 用前面 DIPU 设备层的接口来替换 `torch.cuda` 的同名接口。
@@ -65,17 +65,17 @@ Aten 的能力主要依赖于 PyTorch 提供的注册自定义 *backend* 的能
### Dispatch 机制与 DIOPI 算子库
-PyTorch 的算子注册和分派有很多步骤,详见[参考文档](https://github.com/pytorch/pytorch/wiki/PyTorch-dispatcher-walkthrough)。
+PyTorch 的算子注册和分派有很多步骤,详见 [参考文档](https://github.com/pytorch/pytorch/wiki/PyTorch-dispatcher-walkthrough)。
-DIPU CPP 层适配的 ATen 算子对应的是分派过程中最底层(*backend* 层) 的算子或者 *composite* 层里等效为 *backend* 的算子。
+DIPU CPP 层适配的 ATen 算子对应的是分派过程中最底层(*backend* 层)的算子或者 *composite* 层里等效为 *backend* 的算子。
-这里面有一定的灵活性,以`Linear` 算子为例,在 PyTorch 的 `cpu/cuda` 设备上,它被实现为一个 `composite` 算子,实际的 *backend* 层算子是组合算子内部调用的 `addmm` 或者更底层的 `mm`。 而在 DIPU (`privateuse1`) 设备中,目前是注册了一个 `Linear` 算子(DIOPI 有这个算子)来替代组合算子,所以分派会直接走到新的 *backend* 层算子 `Linear`,而不会在调用原来的 `addmm/mm`。但是如果对应设备的 DIOPI 的 IMPL 算子库 没有实现 `diopiLinear` 而是实现了 `mm` 算子,也是可以正常走通 `Linear` 的调用流程的。
+这里面有一定的灵活性,以`Linear` 算子为例,在 PyTorch 的 `cpu/cuda` 设备上,它被实现为一个 `composite` 算子,实际的 *backend* 层算子是组合算子内部调用的 `addmm` 或者更底层的 `mm`。而在 DIPU (`privateuse1`) 设备中,目前是注册了一个 `Linear` 算子(DIOPI 有这个算子)来替代组合算子,所以分派会直接走到新的 *backend* 层算子 `Linear`,而不会在调用原来的 `addmm/mm`。但是如果对应设备的 DIOPI 的 IMPL 算子库 没有实现 `diopiLinear` 而是实现了 `mm` 算子,也是可以正常走通 `Linear` 的调用流程的。
### 无侵入式的 PyTorch 扩展包
-DIPU 没有直接修改 PyTorch 的代码,而是使用 out-of-tree 的方式接入新设备,详见[参考文档](https://pytorch.org/tutorials/advanced/extend_dispatcher.html)。
+DIPU 没有直接修改 PyTorch 的代码,而是使用 out-of-tree 的方式接入新设备,详见 [参考文档](https://pytorch.org/tutorials/advanced/extend_dispatcher.html)。
-PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,DIPU目前没有和 PyTorch 做官方的沟通,因此 PyTorch 主干里没有 `DIPU` 这个设备,目前是暂时借用 `PrivateUse1` 这个 Key(后续考虑改为借用 `XPU` 设备 Key,因为这个 Key 在 PyTorch 主干代码中有更好的支持)。
+PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,DIPU 目前没有和 PyTorch 做官方的沟通,因此 PyTorch 主干里没有 `DIPU` 这个设备,目前是暂时借用 `PrivateUse1` 这个 Key(后续考虑改为借用 `XPU` 设备 Key,因为这个 Key 在 PyTorch 主干代码中有更好的支持)。
基于用户私有的 *Backend Key* 和 `Dispatch Key`,PyTorch 会把算子调用请求分发到对应设备的算子实现。另外 `c10` 本身提供了一些注册能力,比如 `C10_REGISTER_GUARD_IMPL`,可以让用户把私有设备的 *Runtime* 代码注册到框架中。
@@ -83,7 +83,7 @@ PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,
### 算子适配能力
-为了更好的接入 DIOPI 算子,DIPU 提供了一组算子适配相关的辅助能力,比如灵活的算子 Fallback to CPU 的能力、算子精度自动对比的能力(对比 DIOPI 算子和 PyTorch 原生的 CPU 算子),算子执行过程中打印算子参数的能力。基于这些能力,接入算子时可以更方便排查算子精度等问题。 相关能力的具体说明参见 [Quick Start 文档](https://deeplink.readthedocs.io/zh-cn/latest/doc/DIPU/quick_start.html)的“算子库接入”章节。
+为了更好的接入 DIOPI 算子,DIPU 提供了一组算子适配相关的辅助能力,比如灵活的算子 Fallback to CPU 的能力、算子精度自动对比的能力(对比 DIOPI 算子和 PyTorch 原生的 CPU 算子),算子执行过程中打印算子参数的能力。基于这些能力,接入算子时可以更方便排查算子精度等问题。相关能力的具体说明参见 [Quick Start 文档](https://deeplink.readthedocs.io/zh-cn/latest/doc/DIPU/quick_start.html) 的“算子库接入”章节。
## 质量保障体系
@@ -94,7 +94,7 @@ PyTorch 要求 out-of-tree 的代码必须定义一个私有的 *Backend Key*,
2. 简单开发的手工测例。这部分测例更注重算子能否跑通,对算子要求较低。
3. 模型测试。我们开发了 `one_iter` 精度对比工具,会先在精度正确性没问题的设备(如 CPU 和 CUDA)上训练模型,保存每一层的算子输入、输出、权重、梯度数据,再在待测试设备上训练模型,逐层对比训练精度。
-> 更多信息请参考 [dipu/tests](./dipu/tests) 目录。
+> 更多信息请参考 [dipu/tests](./tests) 目录。
## Learn More
diff --git a/dipu/torch_dipu/profiler/readme.md b/dipu/torch_dipu/profiler/README.md
similarity index 75%
rename from dipu/torch_dipu/profiler/readme.md
rename to dipu/torch_dipu/profiler/README.md
index 6a91f325c6..46f1e2fdb5 100644
--- a/dipu/torch_dipu/profiler/readme.md
+++ b/dipu/torch_dipu/profiler/README.md
@@ -1,10 +1,14 @@
# Profiler
## 简介
-DeepLink Profiler是一个允许在训练和推理过程中收集性能指标的工具。Profiler的上下文管理器API可用于了解哪些模型算子最耗时,并检查其输入形状和堆栈跟踪,研究设备kernel活动并可视化执行跟踪。当使用DeepLink进行模型训练时,可以使用DeepLink Profiler定位性能瓶颈,指导性能优化。
+
+DeepLink Profiler 是一个允许在训练和推理过程中收集性能指标的工具。Profiler 的上下文管理器 API 可用于了解哪些模型算子最耗时,并检查其输入形状和堆栈跟踪,研究设备 kernel 活动并可视化执行跟踪。当使用 DeepLink 进行模型训练时,可以使用 DeepLink Profiler 定位性能瓶颈,指导性能优化。
+
## 使用说明
-本教程将以resnet18模型为例,讲解如何使用DeepLink Profiler分析模型性能。
-1. 导入必要的库
+
+本教程将以 resnet18 模型为例,讲解如何使用 DeepLink Profiler 分析模型性能。
+
+### 1. 导入必要的库
``` python
import torch_dipu
@@ -13,22 +17,23 @@ import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
```
-2. 实例化resnet18模型
+### 2. 实例化 resnet18 模型
```python
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)
```
-3. 使用DeepLink profiler分析模型执行时间
+### 3. 使用 DeepLink Profiler 分析模型执行时间
+
+DeepLink Profiler 接口对齐了 PyTorch Profiler,通过上下文管理器启用,并接受很多参数,常用的参数有
-DeepLink profiler接口对齐了PyTorch Profiler,通过上下文管理器启用,并接受很多参数,常用的参数有
+ `activities`:要收集的打点列表
- * `ProfilerActivity.CPU`:收集PyTorch算子、TorchScript函数以及用户自定义代码标签
- * `ProfilerActivity.CUDA`:收集设备kernel打点
+ + `ProfilerActivity.CPU`:收集 PyTorch 算子、TorchScript 函数以及用户自定义代码标签
+ + `ProfilerActivity.CUDA`:收集设备 kernel 打点
+ `record_shapes`:是否记录算子输入的形状
+ `profile_memory`:是否统计模型张量内存消耗
-+ `use_cuda`:是否统计设备kernel执行时间
++ `use_cuda`:是否统计设备 kernel 执行时间
+ `with_stack`:是否打印调用栈
```Python
@@ -36,14 +41,16 @@ with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
with record_function("model_inference"):
model(inputs)
```
+
打印出上面执行的统计数据:
+
```Python
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```
-输出如下
+输出如下:
-```
+```text
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
@@ -60,16 +67,20 @@ print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 253.751ms
```
-从输出中可以发现,大部分的执行时间花在conv2d。
-需要说明的是,cpu time是指这个算子执行的总时间;同时,该算子有可能调用其他算子,self cpu time是该算子的总时间减去调用其他算子的时间。
+从输出中可以发现,大部分的执行时间花在 conv2d。
+
+需要说明的是,cpu time 是指这个算子执行的总时间;同时,该算子有可能调用其他算子,self cpu time 是该算子的总时间减去调用其他算子的时间。
+
+要获得更精细的结果粒度并包括运算符输入形状,需要设置 `group_by_input_shape=True`(注意:这需要将 profiler 的输入参数 `record_shape` 设置为 `True`)
-要获得更精细的结果粒度并包括运算符输入形状,需要设置`group_by_input_shape=True`(注意:这需要将profile的输入参数`record_shape`设置为True)
```Python
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
```
-输出如下
-```
+
+输出如下:
+
+```text
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ --------------------------------------------------------------------------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls Input Shapes
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ --------------------------------------------------------------------------------
@@ -85,9 +96,11 @@ print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total
aten::thnn_conv2d 0.01% 15.000us 14.36% 34.465ms 34.465ms 1 [[5, 3, 224, 224], [64, 3, 7, 7], [], [], [], []]
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ --------------------------------------------------------------------------------
```
-从输出可以看到,resnet18模型中卷积包含了几种不同的形状。
-Profiler还可用于分析在GPU和其他AI加速芯片上执行的模型的性能:
+从输出可以看到,resnet18 模型中卷积包含了几种不同的形状。
+
+Profiler 还可用于分析在 GPU 和其他 AI 加速芯片上执行的模型的性能:
+
```Python
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()
@@ -100,9 +113,9 @@ with profile(activities=[
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
```
-输出如下
+输出如下:
-```
+```text
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
@@ -130,11 +143,13 @@ print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
Self CPU time total: 143.583ms
Self CUDA time total: 168.781ms
```
-从输出可以看到,`diopiConvolution2d`和`diopiBatchNorm`是两个算子耗时最长。
-4. 分析内存消耗
+从输出可以看到,`diopiConvolution2d` 和 `diopiBatchNorm` 是两个算子耗时最长。
+
+### 4. 分析内存消耗
+
+PyTorch Profiler 还可以统计算子分配或释放的内存量。要启用内存分析功能,请将 `profile_memory` 设置成 `True`。
-PyTorch profiler还可以统计算子分配或释放的内存量。要启用内存分析功能,请将profile_memory设置成True。
```Python
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)
@@ -143,8 +158,10 @@ with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shap
print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
```
-输出如下
-```
+
+输出如下:
+
+```text
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg CPU Mem Self CPU Mem # of Calls
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
@@ -162,28 +179,27 @@ print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))
Self CPU time total: 119.442ms
```
+### 5. 使用 Chrome trace viewer 进行可视化
-5. 使用chrome trace viewer进行可视化
+Profiling 结果可以输出成 json 文件
-Profiling结果可以输出成json文件
```Python
model = models.resnet18().cuda()
inputs = torch.randn(5, 3, 224, 224).cuda()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
model(inputs)
-
+
prof.export_chrome_trace("trace.json")
```
-使用Chrome trace viewer (chrome://tracing)工具查看trace.json文件,可视化结果如下图
+使用 Chrome trace viewer (`chrome://tracing`) 工具查看 `trace.json` 文件,可视化结果如下图:
+
+
-
-

-
+### 6. 打印调用链
-6. 打印调用链
+Profiler 可用于分析 Python 和 TorchScript 堆栈跟踪。
-Profiler可用于分析Python和TorchScript堆栈跟踪。
```Python
with profile(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
@@ -194,8 +210,10 @@ with profile(
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2))
```
-输出如下
-```
+
+输出如下:
+
+```text
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -----------------------------------------------------------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Source Location
------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -----------------------------------------------------------------
@@ -215,13 +233,16 @@ print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total"
Self CPU time total: 139.666ms
Self CUDA time total: 169.640ms
```
-7. 使用Profiler分析长时间运行任务
-Profiler提供了一个额外的API来处理长时间运行的作业(如模型训练)。跟踪所有的执行可能很慢,并导致非常大的跟踪文件。要避免这种情况,请使用可选参数:
- 1. `schedule`:指定一个函数,该函数以整数参数作为输入,并返回一个动作给Profiler。使用这个参数的最佳方式是使用`torch.profiler.schedule`辅助函数,它可以为您生成一个schedule
- 2. `on_trace_ready`:指定一个函数,该函数将Profiler的引用作为输入,并在每次准备好新跟踪时由Profiler调用。
+### 7. 使用 Profiler 分析长时间运行任务
+
+Profiler 提供了一个额外的 API 来处理长时间运行的作业(如模型训练)。跟踪所有的执行可能很慢,并导致非常大的跟踪文件。要避免这种情况,请使用可选参数:
+
+1. `schedule`:指定一个函数,该函数以整数参数作为输入,并返回一个动作给 Profiler。使用这个参数的最佳方式是使用 `torch.profiler.schedule` 辅助函数,它可以为您生成一个 schedule。
+2. `on_trace_ready`:指定一个函数,该函数将 Profiler 的引用作为输入,并在每次准备好新跟踪时由 Profiler 调用。
+
+为了说明 API 是如何工作的,让我们首先考虑以下带有 `torch.profiler.schedule` 函数的示例:
-为了说明API是如何工作的,让我们首先考虑以下带有`torch.profiler.schedule`函数的示例:
```Python
from torch.profiler import schedule
@@ -232,22 +253,25 @@ my_schedule = schedule(
active=3,
repeat=2)
```
-Profiler假设长时间运行的任务由多个步骤组成,步骤编号从零开始。上面的示例定义了分析器的以下操作序列:
-1. 参数`skip_first`告诉分析器在前10个步骤中忽略追踪(`skip_first`的默认值为零);
-2. 在前`skip_first`个步骤之后,分析器开始执行分析器周期;
+
+Profiler 假设长时间运行的任务由多个步骤组成,步骤编号从零开始。上面的示例定义了分析器的以下操作序列:
+
+1. 参数 `skip_first` 告诉分析器在前 10 个步骤中忽略追踪(`skip_first` 的默认值为零);
+2. 在前 `skip_first` 个步骤之后,分析器开始执行分析器周期;
3. 每个周期包括三个阶段:
- 1. 空闲阶段(`wait=5`步骤),在此阶段分析器处于非活动状态;
- 2. 预热阶段(`warmup=1`步骤),在此阶段分析器开始追踪,但结果会被丢弃。此阶段用于丢弃追踪开始时分析器获取的样本,因为它们通常会被额外的开销所影响;
- 3. 活动追踪阶段(`active=3`步骤),在此阶段分析器进行追踪和记录数据;
-4. 可选的repeat参数指定循环的上限。默认情况下(零值),分析器将在任务运行时执行循环。
+ 1. 空闲阶段(`wait=5` 步骤),在此阶段分析器处于非活动状态;
+ 2. 预热阶段(`warmup=1` 步骤),在此阶段分析器开始追踪,但结果会被丢弃。此阶段用于丢弃追踪开始时分析器获取的样本,因为它们通常会被额外的开销所影响;
+ 3. 活动追踪阶段(`active=3` 步骤),在此阶段分析器进行追踪和记录数据;
+4. 可选的 repeat 参数指定循环的上限。默认情况下(零值),分析器将在任务运行时执行循环。
-因此,在上面的示例中,分析器将跳过前15个步骤,将下一个步骤用于预热,积极记录接下来的3个步骤,再跳过另外5个步骤,将下一个步骤用于预热,再积极记录另外3个步骤。由于指定了repeat=2参数值,分析器将在第一个两个周期后停止记录。
+因此,在上面的示例中,分析器将跳过前 15 个步骤,将下一个步骤用于预热,积极记录接下来的 3 个步骤,再跳过另外 5 个步骤,将下一个步骤用于预热,再积极记录另外 3 个步骤。由于指定了 `repeat=2` 参数值,分析器将在第一个两个周期后停止记录。
-在每个周期结束时,分析器调用指定的on_trace_ready函数,并将自身作为参数传递。该函数用于处理新的追踪结果,可以通过获取表格输出或将输出保存为追踪文件来进行处理。
+在每个周期结束时,分析器调用指定的 `on_trace_ready` 函数,并将自身作为参数传递。该函数用于处理新的追踪结果,可以通过获取表格输出或将输出保存为追踪文件来进行处理。
-要向分析器发送下一个步骤已开始的信号,请调用prof.step()函数。当前分析器步骤存储在prof.step_num中。
+要向分析器发送下一个步骤已开始的信号,请调用 `prof.step()` 函数。当前分析器步骤存储在 `prof.step_num` 中。
以下示例显示了如何使用上述概念:
+
```Python
def trace_handler(p):
output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10)
@@ -266,27 +290,32 @@ with profile(
model(inputs)
p.step()
```
+
## 使用案例
-### 案例一 Mobilenet v2多卡训练性能分析与优化
-##### 1. 问题描述:
+### 案例一 Mobilenet v2 多卡训练性能分析与优化
- 开发人员使用某个版本的DeepLink完成Mobilenet v2的适配后,发现该模型在NV上单机八卡训练很慢,需要进行性能优化,提升训练性能。
+#### 问题描述
-##### 2. 使用DeepLink Profer进行性能分析
- 1. 修改`mmpretrain`的`tools/train.py`,在`runner.train()`之前开启Profiler,将收集到的性能分析数据存入`mobilenetv2_profiler-slow`
-```Python
-from mmengine.hooks import ProfilerHook
+开发人员使用某个版本的 DeepLink 完成 Mobilenet v2 的适配后,发现该模型在 NV 上单机八卡训练很慢,需要进行性能优化,提升训练性能。
-profiler_hook = ProfilerHook(by_epoch = False, profile_times=10, activity_with_cpu=True, activity_with_cuda=True, json_trace_path='mobilenetv2_profiler-slow')
-runner.register_custom_hooks([profiler_hook])
-```
- 2. 使用chrome trace viewer查看,发现conv2d耗时长,从图中可以看到conv2d调用到了`thnn_conv2d`,而不是预期的`cudnn_convolution`
-
-

-
+#### 使用 DeepLink Profiler 进行性能分析
+
+1. 修改 `mmpretrain` 的 `tools/train.py`,在 `runner.train()` 之前开启 Profiler,将收集到的性能分析数据存入 `mobilenetv2_profiler-slow`:
+
+ ```Python
+ from mmengine.hooks import ProfilerHook
+
+ profiler_hook = ProfilerHook(by_epoch = False, profile_times=10, activity_with_cpu=True, activity_with_cuda=True, json_trace_path='mobilenetv2_profiler-slow')
+ runner.register_custom_hooks([profiler_hook])
+ ```
+
+2. 使用 chrome trace viewer 查看,发现 conv2d 耗时长,从图中可以看到 conv2d 调用到了`thnn_conv2d`,而不是预期的`cudnn_convolution`。
+
+ 
+
+3. 最后定位到 DeepLink 某个版本新增了 `torch._C._set_cudnn_enabled(false)`,关闭了 cudnn,把这句话删除速度恢复正常。
- 3. 最后定位到DeepLink某个版本新增了 `torch._C._set_cudnn_enabled(false)`,关闭了cudnn,把这句话删除速度恢复正常。
+## 参考资料
-## 参考
-1. [PyTorch profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
++ [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
From 2770001c28652a803ce7d78870f645d46c2f73cd Mon Sep 17 00:00:00 2001
From: liwenjian-sensetime
<109193776+liwenjian-sensetime@users.noreply.github.com>
Date: Thu, 7 Dec 2023 11:06:22 +0800
Subject: [PATCH 03/58] update env python 3.10 (#503)
---
.github/workflows/main.yml | 2 +-
dipu/scripts/ci/camb/ci_camb_env.sh | 10 +++++-----
dipu/scripts/ci/nv/ci_nv_env.sh | 12 ++++++------
3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6f946ae407..9b3f4cff4e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -128,7 +128,7 @@ jobs:
cd ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Camb
rm -rf scripts
ln -s ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main/dipu/third_party/DIOPI/scripts scripts
- source /mnt/cache/share/platform/env/camb_ci_diopi_impl
+ source /mnt/cache/share/platform/env/pt2.0_diopi
bash scripts/increment_coverage.sh ${REQUIRE_COVERAGE}
"""
diff --git a/dipu/scripts/ci/camb/ci_camb_env.sh b/dipu/scripts/ci/camb/ci_camb_env.sh
index 7527809648..858b9d4654 100644
--- a/dipu/scripts/ci/camb/ci_camb_env.sh
+++ b/dipu/scripts/ci/camb/ci_camb_env.sh
@@ -1,9 +1,9 @@
PLATFORM=/mnt/lustre/share/platform
-ENV_NAME=dipu_poc
+ENV_NAME=pt2.0_diopi
export PATH=`python ${PLATFORM}/env/clear_path.py PATH`
export LD_LIBRARY_PATH=`python ${PLATFORM}/env/clear_path.py LD_LIBRARY_PATH`
-GCC_ROOT=/mnt/lustre/share/platform/dep/gcc-7.5
-CONDA_ROOT=${PLATFORM}/env/miniconda3.8
+GCC_ROOT=/mnt/lustre/share/platform/dep/gcc-10.2
+CONDA_ROOT=${PLATFORM}/env/miniconda3.10
export NEUWARE_HOME=/usr/local/neuware
export CC=${GCC_ROOT}/bin/gcc
@@ -13,8 +13,8 @@ export CXX=${GCC_ROOT}/bin/g++
export DIOPI_ROOT=$(pwd)/third_party/DIOPI/impl/lib/
export DIPU_ROOT=$(pwd)/torch_dipu
export LD_LIBRARY_PATH=$DIPU_ROOT:$LD_LIBRARY_PATH
-export PYTHONPATH=${PYTORCH_DIR}/install_path/lib/python3.8/site-packages:${PYTHONPATH}
-export PATH=${GCC_ROOT}/bin:${PYTORCH_DIR}/install_path/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PATH}
+export PYTHONPATH=${PLATFORM}/dep/DIOPI_pytorch/pytorch2.0:${PYTHONPATH}
+export PATH=${GCC_ROOT}/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PATH}
export LD_PRELOAD=${GCC_ROOT}/lib64/libstdc++.so.6
diff --git a/dipu/scripts/ci/nv/ci_nv_env.sh b/dipu/scripts/ci/nv/ci_nv_env.sh
index d885dc983e..2f26b9d899 100644
--- a/dipu/scripts/ci/nv/ci_nv_env.sh
+++ b/dipu/scripts/ci/nv/ci_nv_env.sh
@@ -2,14 +2,14 @@ PLATFORM=/mnt/cache/share/platform
ENV_NAME=pt2.0_diopi
export PATH=`python ${PLATFORM}/env/clear_path.py PATH`
export LD_LIBRARY_PATH=`python ${PLATFORM}/env/clear_path.py LD_LIBRARY_PATH`
-GCC_ROOT=${PLATFORM}/dep/gcc-7.5
-CONDA_ROOT=${PLATFORM}/env/miniconda3.8
+GCC_ROOT=${PLATFORM}/dep/gcc-10.2
+CONDA_ROOT=${PLATFORM}/env/miniconda3.10
export CC=${GCC_ROOT}/bin/gcc
export CXX=${GCC_ROOT}/bin/g++
-export CUDA_PATH=${PLATFORM}/dep/cuda11.7-cudnn8.5
-export MPI_ROOT=${PLATFORM}/dep/openmpi-4.0.5-cuda11.7
-export NCCL_ROOT=${PLATFORM}/dep/nccl-2.13.4-cuda11.7
+export CUDA_PATH=${PLATFORM}/dep/cuda11.8-cudnn8.9
+export MPI_ROOT=${PLATFORM}/dep/openmpi-4.0.5-cuda11.8
+export NCCL_ROOT=${PLATFORM}/dep/nccl-2.15.5-cuda11.8
export GTEST_ROOT=${PLATFORM}/dep/googletest-gcc5.4
@@ -24,7 +24,7 @@ export DIOPI_ROOT=$(pwd)/third_party/DIOPI/impl/lib/
export DIPU_ROOT=$(pwd)/torch_dipu
export DIOPI_PATH=$(pwd)/third_party/DIOPI/proto
export DIPU_PATH=${DIPU_ROOT}
-export PYTORCH_DIR=${PLATFORM}/env/miniconda3.8/envs/pt2.0_diopi/lib/python3.8/site-packages
+export PYTORCH_DIR=${PLATFORM}/dep/DIOPI_pytorch/pytorch2.0_cu118
export LD_LIBRARY_PATH=$DIPU_ROOT:$LD_LIBRARY_PATH
export PYTHONPATH=${PYTORCH_DIR}:${PYTHONPATH}
export PATH=${GCC_ROOT}/bin:${CONDA_ROOT}/envs/dipu_poc/bin:${CONDA_ROOT}/bin:${PLATFORM}/dep/binutils-2.27/bin:${PATH}
From 30894d09b0eafc5717dbf89de99bf165ae25efd3 Mon Sep 17 00:00:00 2001
From: tangzhiyi11
Date: Thu, 7 Dec 2023 19:48:18 +0800
Subject: [PATCH 04/58] [dicp][ascend] get soc_version from aclrt (#505)
---
dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h | 6 +++---
dicp/dicp/vendor/AscendGraph/compile_job.py | 5 +++--
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
index 380670146f..69e06fec8a 100644
--- a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
+++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
@@ -12,6 +12,7 @@
#include
#include
+#include "acl/acl.h"
#include "all_ops.h"
#include "ascend_string.h"
#include "ge_api.h"
@@ -83,10 +84,9 @@ class AclgraphBuilder {
public:
explicit AclgraphBuilder() {
// 1. system init
- std::string kSocVersion = "Ascend910ProB";
+ auto kSocVersion = aclrtGetSocName();
std::map<AscendString, AscendString> global_options = {
- {AscendString(ge::ir_option::SOC_VERSION),
- AscendString(kSocVersion.c_str())},
+ {AscendString(ge::ir_option::SOC_VERSION), AscendString(kSocVersion)},
{AscendString(ge::ir_option::PRECISION_MODE), "allow_fp32_to_fp16"},
};
auto status = aclgrphBuildInitialize(global_options);
diff --git a/dicp/dicp/vendor/AscendGraph/compile_job.py b/dicp/dicp/vendor/AscendGraph/compile_job.py
index 6b3b2b8228..625dc3dfb3 100644
--- a/dicp/dicp/vendor/AscendGraph/compile_job.py
+++ b/dicp/dicp/vendor/AscendGraph/compile_job.py
@@ -34,6 +34,7 @@ def __init__(self, source_code) -> None:
'-std=c++11',
'-O3',
'-Wall',
+ '-I/usr/local/Ascend/ascend-toolkit/latest/include',
'-I/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_proto/inc',
'-I/usr/local/Ascend/ascend-toolkit/latest/include/graph',
'-I/usr/local/Ascend/ascend-toolkit/latest/include/ge',
@@ -46,10 +47,10 @@ def __init__(self, source_code) -> None:
'-lge_runner',
source_path,
'-o' + self._lib_path,
- '-Wl,-rpath,/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub',
'/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub/libgraph.so',
'/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub/libge_runner.so',
- '/usr/local/Ascend/ascend-toolkit/latest/lib64/libgraph_base.so']
+ '/usr/local/Ascend/ascend-toolkit/latest/lib64/libgraph_base.so',
+ '/usr/local/Ascend/ascend-toolkit/latest/runtime/lib64/stub/libascendcl.so',]
def _compile(self):
if not os.path.exists(self._lib_path):
From 61f57ebc05df32c630097c174766a96760d2fa0c Mon Sep 17 00:00:00 2001
From: Chengyuan Li <37681002+cyLi-Tiger@users.noreply.github.com>
Date: Fri, 8 Dec 2023 17:01:42 +0800
Subject: [PATCH 05/58] lcy/clang-tidy (#483)
* fix namespace declaration format
* update diopi_functions.yaml
* update clang-tidy
* update clang-tidy
* change tab into spaces
* allow const_cast
* fix bug
* fix comment
* fix comments
* fix comments
---
.../autogen_diopi_wrapper.py | 20 +-
.../diopi_functions.yaml | 255 +++++++++++++-----
.../diopi_wrapper_template.py | 96 ++++---
.../csrc_dipu/aten/ops/DIPUCopy.hpp | 4 +-
4 files changed, 248 insertions(+), 127 deletions(-)
diff --git a/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py b/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py
index 5fc67a107d..366d5e6eda 100644
--- a/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py
+++ b/dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py
@@ -118,7 +118,7 @@ def create_transform_input_to_cpu_code(fun_config):
for input in optional_tensor_list_inputs:
input_process_code += f"\nc10::List<c10::optional<at::Tensor>> {input}_cpu;\n"
input_process_code += f"for (int i = 0; i < {input}.size();++i)" + " {\n"
- input_process_code += f"\t{input}_cpu.push_back({input}[i].has_value() && {input}[i].value().defined() ? c10::make_optional({input}[i].value().cpu()) : {input}[i]);\n"
+ input_process_code += f" {input}_cpu.push_back({input}[i].has_value() && {input}[i].value().defined() ? c10::make_optional({input}[i].value().cpu()) : {input}[i]);\n"
input_process_code += "}\n"
outputs = re.findall('Tensor\([a-z]!\)[ ]+([\w\d_]+){1}', schema[:schema.find('->')])
@@ -151,7 +151,7 @@ def create_print_op_args_code(fun_config):
code += "if (dumpOpArgLevel() > 1) {\n"
for input in inputs:
input = input.strip()
- code += f'\tstd::cout << "\t{opname}:\t{input}:" << dumpArg({input}) << std::endl;\n'
+ code += f' std::cout << "\t{opname}:\t{input}:" << dumpArg({input}) << std::endl;\n'
code += "}"
return code
@@ -455,11 +455,11 @@ def create_result_compare_code(fun_config):
code = ''
if len(return_param) == 1 :
compare_code = f'_allclose(result_cpu, result_device)'
- code += f'std::cout << "autocompare:\t{op_name}\t{return_param[0]}:" << std::endl << "\t" << dumpArg(result_cpu) << std::endl << "\t" << dumpArg(result_device) << std::endl << "\t" << {compare_code} << std::endl;\n';
+ code += f'std::cout << "autocompare:\t{op_name}\t{return_param[0]}:" << std::endl << " " << dumpArg(result_cpu) << std::endl << " " << dumpArg(result_device) << std::endl << " " << {compare_code} << std::endl;\n';
elif len(return_param) > 1:
for i in range(len(return_param)):
compare_code = f'_allclose(std::get<{i}>(result_cpu), std::get<{i}>(result_device))'
- code += f'std::cout << "autocompare:\t{op_name}\t{return_param[i]}:" << std::endl << "\t" << dumpArg(std::get<{i}>(result_cpu)) << std::endl << "\t" << dumpArg(std::get<{i}>(result_device)) << std::endl << "\t" << {compare_code} << std::endl;\n';
+ code += f'std::cout << "autocompare:\t{op_name}\t{return_param[i]}:" << std::endl << " " << dumpArg(std::get<{i}>(result_cpu)) << std::endl << " " << dumpArg(std::get<{i}>(result_device)) << std::endl << " " << {compare_code} << std::endl;\n';
inputs = re.findall('Tensor +([\w\d_]+)', schema[:schema.find('->')])
inputs += re.findall('Tensor *\([a-z]!\) *\[ *\] +([\w\d_]+)', schema[:schema.find('->')])
@@ -474,8 +474,8 @@ def create_code_to_print_fun_call_info_from_schema(fun_config):
op_name = get_op_name_from_schema(fun_config['schema'])
diopi_func = fun_config.get('interface', '')
diopi_func = diopi_func[0 : diopi_func.find('(')]
- debug_code = "if (dumpOpArgLevel() > 0) {\n\t"
- debug_code += f'printf("--%-50s %-30s \\n", "[{op_name}]:", "{diopi_func}");' + '\n'
+ debug_code = "if (dumpOpArgLevel() > 0) {\n"
+ debug_code += f' printf("--%-50s %-30s \\n", "[{op_name}]:", "{diopi_func}");' + '\n'
debug_code += "}\n"
return debug_code
@@ -539,10 +539,10 @@ def create_device_check_code(fun_config):
for args in set(tensors):
if not args.endswith('?'):
- code += f'\tTORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
+ code += f' TORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
else:
args = args[0:-1]
- code += f'\tTORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
+ code += f' TORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
if len(tensors) > 0:
code += "}"
@@ -588,7 +588,9 @@ def functions_code_gen(fun_config):
if input.strip().endswith('?'):
input = input.replace('?', '')
input_process_code += f"\n::diopiConstTensorHandle_t {input}{diopi_tensor_suffix} = nullptr;\n"
- input_process_code += f"if ({input}.has_value() && {input}.value().defined()) {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input}.value());\n\n"
+ input_process_code += f"if ({input}.has_value() && {input}.value().defined())" + "{\n"
+ input_process_code += f" {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input}.value());\n"
+ input_process_code += "}\n"
else:
input_process_code += f"::diopiConstTensorHandle_t {input}{diopi_tensor_suffix} = dipu::diopi_helper::toDiopiTensorHandle({input});\n"
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
index 4b58185a24..46bebbd3f2 100755
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -36,15 +36,15 @@
- schema: "aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"
custom_code_at_the_beginning: |
- if (other.numel() == 1) {
- return dipu_add_scalar_out(self, other.cpu().item(), alpha, out);
- } else if (self.numel() == 1) {
+ if (other.numel() == 1 && other.is_cpu()) {
+ return dipu_add_scalar_out(self, other.item(), alpha, out);
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
if (alpha.toDouble() == 1.0) {
- return dipu_add_scalar_out(other, self.cpu().item(), alpha, out);
- } else {
- dipu_fill__scalar(out, self.cpu().item());
- return dipu_add__tensor(out, other, alpha);
+ return dipu_add_scalar_out(other, self.item(), alpha, out);
}
+ dipu_fill__scalar(out, self.item());
+ return dipu_add__tensor(out, other, alpha);
}
interface: diopiAdd(ctx, out, self, other, alpha)
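The early-return branch above targets the common "device tensor plus one-element CPU tensor" pattern; a minimal sketch of the call it serves, again using `"cuda"` as an illustrative stand-in for the DIPU device:

```python
import torch

a = torch.randn(3, 4, device="cuda")  # tensor on the accelerator
b = torch.tensor(2.0)                 # one-element tensor living on the CPU

# `other` is a one-element CPU tensor, so the wrapper falls through to the
# add.Scalar_out path instead of copying `b` to the device first.
assert torch.allclose(torch.add(a, b), a + 2.0)
```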
@@ -55,7 +55,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_sub_scalar_out(self, other.item(), alpha, out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
at::Tensor selfTensor = at::empty_like(other);
dipu_fill__scalar(selfTensor, self.item());
return dipu_sub_out(selfTensor, other, alpha, out);
@@ -94,7 +95,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_div_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_div_scalar_out(other, self.item(), out);
}
interface: diopiDiv(ctx, out, self, other, RoundModeNone)
@@ -108,7 +110,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_div_scalar_mode_out(self, other.item(), rounding_mode, out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_div_scalar_mode_out(other, self.item(), rounding_mode, out);
}
const auto mode = toDiopiRoundMode(rounding_mode.has_value() ? rounding_mode.value().data():"none");
@@ -135,7 +138,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_mul_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_mul_scalar_out(other, self.item(), out);
}
interface: diopiMul(ctx, out, self, other)
@@ -191,13 +195,19 @@
- schema: "aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))"
interface: diopiBatchNorm(ctx, out, save_mean, save_invstd, input, weight, bias, const_cast(running_mean), const_cast(running_var), training, momentum, eps);
+ custom_code_before_call_diopi: |
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
+ custom_code_before_return: |
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
- schema: "aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)"
custom_code_at_the_beginning: |
const int64_t dim_c = input.size(1);
auto out0 = at::empty_like(input);
auto options = input.options().dtype(at::kFloat);
- at::Tensor out1, out2;
+ at::Tensor out1;
+ at::Tensor out2;
if (!training) {
// do not require save_mean/save_invstd when in test mode
out1 = at::empty({0}, options);
@@ -207,6 +217,11 @@
out2 = at::empty({dim_c}, options);
}
interface: diopiBatchNorm(ctx, out0, out1, out2, input, weight, bias, const_cast(running_mean), const_cast(running_var), training, momentum, eps);
+ custom_code_before_call_diopi: |
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
+ custom_code_before_return: |
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
- schema: "native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)"
custom_code_at_the_beginning: |
@@ -235,7 +250,7 @@
- schema: "native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor out, Tensor save_mean, Tensor save_invstd)"
custom_code_at_the_beginning: |
const auto input_shape = input.sizes();
- const int axis = input_shape.size() - normalized_shape.size();
+ const int axis = static_cast<int>(input_shape.size()) - static_cast<int>(normalized_shape.size());
const int64_t M = c10::multiply_integers(input_shape.cbegin(), input_shape.cbegin() + axis);
std::vector<int64_t> stats_shape(input_shape.size(), 1);
std::copy(input_shape.begin(), input_shape.begin() + axis, stats_shape.begin());
@@ -290,7 +305,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_eq_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_eq_scalar_out(other, self.item(), out);
}
interface: diopiEq(ctx, out, self, other)
@@ -312,7 +328,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_lt_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_lt_scalar_out(other, self.item(), out);
}
interface: diopiLt(ctx, out, self, other)
@@ -334,7 +351,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_ne_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_ne_scalar_out(other, self.item(), out);
}
interface: diopiNe(ctx, out, self, other)
@@ -356,7 +374,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_ge_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_ge_scalar_out(other, self.item(), out);
}
interface: diopiGe(ctx, out, self, other)
@@ -378,7 +397,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_gt_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_gt_scalar_out(other, self.item(), out);
}
interface: diopiGt(ctx, out, self, other)
@@ -400,7 +420,8 @@
custom_code_at_the_beginning: |
if (other.numel() == 1 && other.is_cpu()) {
return dipu_le_scalar_out(self, other.item(), out);
- } else if (self.numel() == 1 && self.is_cpu()) {
+ }
+ if (self.numel() == 1 && self.is_cpu()) {
return dipu_le_scalar_out(other, self.item(), out);
}
interface: diopiLe(ctx, out, self, other)
@@ -551,7 +572,7 @@
auto out = at::empty(output_shape, input.options());
interface: diopiConvTranspose2d(ctx, out, input, weight, bias, stride, padding, output_padding, groups, dilation)
forward_process_code: |
- bool bias_has_value = (bias.has_value() == true) ? bias.value().requires_grad() : false;
+ bool bias_has_value = (bias.has_value()) ? bias.value().requires_grad() : false;
saved_data:
[
stride,
@@ -577,10 +598,7 @@
if (bias_has_value) {
bias_sizes.push_back(grad_output.size(1));
}
- std::array<bool, 3> output_mask;
- output_mask[0] = input.requires_grad();
- output_mask[1] = weight.requires_grad();
- output_mask[2] = bias_has_value;
+ std::array<bool, 3> output_mask = {input.requires_grad(), weight.requires_grad(), bias_has_value};
backward_schema: "convolution_transpose_backward(Tensor grad_output, Tensor input, Tensor weight, int[] bias_sizes, int[] stride, int[] padding, int[] dilation, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
backward_return_code: |
std::vector<at::Tensor> outputs = {
@@ -662,7 +680,9 @@
- schema: "topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)"
custom_code_at_the_beginning: |
std::vector<int64_t> output_size(self.sizes().begin(), self.sizes().end());
- dim = dim < 0 ? (dim + output_size.size()) : dim;
+ if (dim < 0) {
+ dim = dim + static_cast<int64_t>(output_size.size());
+ }
output_size[dim] = k;
auto values = at::empty(output_size, self.options());
auto indices = at::empty(output_size, self.options().dtype(at::kLong));
@@ -693,7 +713,9 @@
device: [all, -cuda]
custom_fallback: True
custom_code_at_the_beginning: |
- at::Tensor grad_input, grad_weight, grad_bias;
+ at::Tensor grad_input;
+ at::Tensor grad_weight;
+ at::Tensor grad_bias;
if (output_mask[0]) {
grad_input = at::empty(input.sizes(), grad_output.options());
}
@@ -850,15 +872,17 @@
- schema: "stack(Tensor[] tensors, int dim=0) -> Tensor"
custom_code_at_the_beginning: |
- dim += dim < 0 ? tensors[0].sizes().size()+1 : 0;
- auto num_tensors = tensors.size();
+ if (dim < 0) {
+ dim += static_cast<int64_t>(tensors[0].sizes().size()) + 1;
+ }
+ auto num_tensors = static_cast<int64_t>(tensors.size());
auto shape = tensors[0].sizes();
std::vector<int64_t> tmp;
for (int i = 0; i < dim; i++) {
tmp.push_back(shape[i]);
}
tmp.push_back(num_tensors);
- for (int i = dim; i < shape.size(); i++) {
+ for (int i = static_cast<int>(dim); i < shape.size(); i++) {
tmp.push_back(shape[i]);
}
const std::vector<int64_t>& const_tmp = tmp;
@@ -873,28 +897,45 @@
- schema: "stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)"
custom_code_at_the_beginning: |
- dim += dim < 0 ? tensors[0].sizes().size() : 0;
+ if (dim < 0) {
+ dim += static_cast<int64_t>(tensors[0].sizes().size());
+ }
std::vector<diopiConstTensorHandle_t> diopiTensorHandles(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
diopiTensorHandles[i] = dipu::diopi_helper::toDiopiTensorHandle(tensors.at(i));
}
- interface: diopiStack(ctx, out, diopiTensorHandles.data(), tensors.size(), dim)
+ interface: diopiStack(ctx, out, diopiTensorHandles.data(), static_cast<int64_t>(tensors.size()), dim)
- schema: "sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)"
custom_code_at_the_beginning: |
- auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim;
+ int64_t dim_ = 0;
+ if (dim < 0) {
+ dim_ = dim + static_cast<int64_t>(self.sizes().size());
+ } else {
+ dim_ = dim;
+ }
auto values = at::empty(self.sizes(), self.options());
auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr)
- schema: "sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"
custom_code_at_the_beginning: |
- auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim;
+ int64_t dim_ = 0;
+ if (dim < 0) {
+ dim_ = dim + static_cast<int64_t>(self.sizes().size());
+ } else {
+ dim_ = dim;
+ }
interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr)
- schema: "sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"
custom_code_at_the_beginning: |
- auto dim_ = dim < 0 ? (dim + self.sizes().size()) : dim;
+ int64_t dim_ = 0;
+ if (dim < 0) {
+ dim_ = dim + static_cast<int64_t>(self.sizes().size());
+ } else {
+ dim_ = dim;
+ }
bool stable_ = stable.has_value() ? stable.value() : false;
const bool *p = &stable_;
interface: diopiSort(ctx, values, indices, self, dim_, descending, p)
@@ -1047,7 +1088,7 @@
}
const auto& self_sizes = self.sizes();
- for (int i = self_sizes.size() - 1, j = output_size.size() - 1;i >= 0;i--, j--) {
+ for (int i = static_cast<int>(self_sizes.size()) - 1, j = static_cast<int>(output_size.size()) - 1;i >= 0;i--, j--) {
output_size[j] *= self_sizes.at(i);
}
@@ -1057,15 +1098,20 @@
- schema: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
custom_code_at_the_beginning: |
auto out = at::empty_like(self);
+ // NOLINTNEXTLINE(readability-suspicious-call-argument)
return dipu_sub_out(other, self, alpha, out);
interface: diopiSub(ctx, out, other, self, alpha)
- schema: "unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor out, Tensor indices, Tensor counts)"
custom_code_at_the_beginning: |
- at::Tensor out, counts, indices;
+ at::Tensor out;
+ at::Tensor counts;
+ at::Tensor indices;
if (return_inverse) {
const auto ndims = self.sizes().size();
- dim += (dim < 0 ? ndims : 0);
+ if (dim < 0) {
+ dim += static_cast<int64_t>(ndims);
+ }
indices = at::empty({self.sizes().at(dim)}, self.options().dtype(at::kLong));
}
diopiTensorHandle_t out_ptr = nullptr;
@@ -1080,7 +1126,9 @@
- schema: "_unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor out, Tensor indices, Tensor counts)"
custom_code_at_the_beginning: |
- at::Tensor out, counts, indices;
+ at::Tensor out;
+ at::Tensor counts;
+ at::Tensor indices;
if (return_inverse) {
indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
}
@@ -1100,7 +1148,7 @@
std::transform(tensors.begin(), tensors.end(), diopiTensorHandles.begin(), [](const at::Tensor& tensor){
return dipu::diopi_helper::toDiopiTensorHandle(tensor);
});
- interface: diopiCat(ctx, out, diopiTensorHandles.data(), tensors.size(), dim);
+ interface: diopiCat(ctx, out, diopiTensorHandles.data(), static_cast<int64_t>(tensors.size()), dim);
- schema: "masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor"
custom_code_at_the_beginning: |
@@ -1125,7 +1173,7 @@
- schema: "min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) min, Tensor(b!) min_indices)"
custom_code_at_the_beginning: |
- dim += ((dim >= 0) ? 0 : self.sizes().size());
+ dim += ((dim >= 0) ? 0 : static_cast<int64_t>(self.sizes().size()));
interface: diopiMin(ctx, min, min_indices, self, dim)
- schema: "max(Tensor self) -> Tensor"
@@ -1143,7 +1191,7 @@
- schema: "max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_indices) -> (Tensor(a!) max, Tensor(b!) max_indices)"
custom_code_at_the_beginning: |
- dim += ((dim >= 0) ? 0 : self.sizes().size());
+ dim += ((dim >= 0) ? 0 : static_cast<int64_t>(self.sizes().size()));
if (max_indices.numel() <= 0) {
auto output_size = self.sizes().vec();
if (keepdim) {
@@ -1265,11 +1313,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0));
- size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(self.size(-2)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(self.size(-1)) * scales_w.value_or(1.0));
}
interface: diopiUpsampleNearest(ctx, out, self, size);
@@ -1278,11 +1326,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0));
- size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(self.size(-2)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(self.size(-1)) * scales_w.value_or(1.0));
}
const char* mode = "bilinear";
interface: diopiUpsampleLinear(ctx, out, self, size, align_corners, mode);
@@ -1292,11 +1340,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor((*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
- size[1] = std::floor((*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
}
interface: diopiUpsampleNearestBackward(ctx, grad_input, grad_output, size, input_size)
@@ -1305,11 +1353,11 @@
custom_code_at_the_beginning: |
std::vector<int64_t> size(2);
custom_code_before_call_diopi: |
- if (output_size.size() > 0) {
+ if (!output_size.empty()) {
std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
} else {
- size[0] = std::floor((*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
- size[1] = std::floor((*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
+ size[0] = std::floor(static_cast<double>(*(input_sizeVector.rbegin() + 1)) * scales_h.value_or(1.0));
+ size[1] = std::floor(static_cast<double>(*(input_sizeVector.rbegin())) * scales_w.value_or(1.0));
}
const char* mode = "bilinear";
interface: diopiUpsampleLinearBackward(ctx, grad_input, grad_output, size, input_size, align_corners, mode)
@@ -1418,7 +1466,7 @@
custom_code_at_the_beginning: |
auto shape = self.sizes();
std::vector<int64_t> output_shape(shape.begin(), shape.end());
- dim += dim >= 0 ? 0 : shape.size();
+ dim += dim >= 0 ? 0 : static_cast<int64_t>(shape.size());
output_shape[dim] = index.numel();
auto out = at::empty({output_shape}, self.options());
interface: diopiIndexSelect(ctx, out, self, dim, index)
@@ -1527,7 +1575,35 @@
at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
backward_return_code: |
- std::vector<at::Tensor> outputs(7);
+ /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h
+ *
+ * ''' custom_function.h
+ * auto num_outputs = static_cast<int>(outputs.size());
+ * // Returning too many results is ok, but only as long as they're all
+ * // undefined. Truncate the result vector in that case.
+ * if (num_outputs > num_forward_inputs) {
+ * bool all_undef = true;
+ * for (const auto i : c10::irange(num_forward_inputs, num_outputs)) {
+ * all_undef &= (!outputs[i].defined());
+ * }
+ * if (all_undef) {
+ * outputs.resize(num_forward_inputs);
+ * num_outputs = num_forward_inputs;
+ * }
+ * }
+ *
+ * if (num_outputs != num_forward_inputs) {
+ * std::string msg("function ");
+ * msg += name() + " returned an incorrect number of gradients (expected ";
+ * msg += c10::to_string(num_forward_inputs) + ", got ";
+ * msg += c10::to_string(num_outputs) + ")";
+ * throw std::runtime_error(msg);
+ * }
+ * '''
+ */
+
+ constexpr int kSameAsInputSize = 7;
+ std::vector<at::Tensor> outputs(kSameAsInputSize);
outputs[0] = result;
return outputs;
@@ -1610,7 +1686,35 @@
at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
backward_return_code: |
- std::vector<at::Tensor> outputs(7);
+ /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h
+ *
+ * ''' custom_function.h
+ * auto num_outputs = static_cast<int>(outputs.size());
+ * // Returning too many results is ok, but only as long as they're all
+ * // undefined. Truncate the result vector in that case.
+ * if (num_outputs > num_forward_inputs) {
+ * bool all_undef = true;
+ * for (const auto i : c10::irange(num_forward_inputs, num_outputs)) {
+ * all_undef &= (!outputs[i].defined());
+ * }
+ * if (all_undef) {
+ * outputs.resize(num_forward_inputs);
+ * num_outputs = num_forward_inputs;
+ * }
+ * }
+ *
+ * if (num_outputs != num_forward_inputs) {
+ * std::string msg("function ");
+ * msg += name() + " returned an incorrect number of gradients (expected ";
+ * msg += c10::to_string(num_forward_inputs) + ", got ";
+ * msg += c10::to_string(num_outputs) + ")";
+ * throw std::runtime_error(msg);
+ * }
+ * '''
+ */
+
+ constexpr int kSameAsInputSize = 7;
+ std::vector<at::Tensor> outputs(kSameAsInputSize);
outputs[0] = result;
return outputs;
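The comment block above restates why these CTC-loss wrappers return a fixed-size gradient vector: torch::autograd checks that backward() produces one entry per forward input, and any extra entries must be undefined. A minimal sketch of the resulting shape, assuming seven forward inputs as in these schemas (the helper name is illustrative, not part of the patch):

    // Sketch: return one real gradient followed by undefined placeholders,
    // so the gradient count matches the number of forward inputs.
    #include <vector>
    #include <ATen/ATen.h>

    std::vector<at::Tensor> make_backward_outputs(const at::Tensor& grad_log_probs) {
      constexpr int kNumForwardInputs = 7;                   // mirrors the forward parameter count
      std::vector<at::Tensor> outputs(kNumForwardInputs);    // default-constructed tensors are undefined
      outputs[0] = grad_log_probs;                           // only log_probs receives a gradient
      return outputs;
    }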
@@ -1755,7 +1859,7 @@
indices_tensor_vec[i] = (indices[i].has_value() && indices[i].value().defined()) ? indices[i].value().to(self.device()) : at::Tensor();
indices_vec[i] = diopi_helper::toDiopiTensorHandle(indices_tensor_vec[i]);
}
- interface: diopiIndex(ctx, &out_ptr, self, indices_vec.data(), indices_vec.size())
+ interface: diopiIndex(ctx, &out_ptr, self, indices_vec.data(), static_cast<int64_t>(indices_vec.size()))
custom_code_before_return: |
dipu::getCurrentDIPUStream().synchronize();
out = *reinterpret_cast<at::Tensor*>(out_ptr);
@@ -1769,7 +1873,7 @@
indices_tensor_vec[i] = (indices[i].has_value() && indices[i].value().defined()) ? indices[i].value().to(self.device()) : at::Tensor();
indices_vec[i] = diopi_helper::toDiopiTensorHandle(indices_tensor_vec[i]);
}
- interface: diopiIndexPut(ctx, self, self, values, indices_vec.data(), indices_vec.size(), accumulate)
+ interface: diopiIndexPut(ctx, self, self, values, indices_vec.data(), static_cast<int64_t>(indices_vec.size()), accumulate)
- schema: "_cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor"
custom_code_at_the_beginning: |
@@ -1832,15 +1936,15 @@
int num_blocks = 1;
for(int i = 0; i < 2; i++){
- num_blocks *= int((input_shape[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i]) + 1;
+ num_blocks *= static_cast<int>((input_shape[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i]) + 1;
}
- int channels = input_shape[1];
+ int channels = static_cast<int>(input_shape[1]);
for(int i = 0; i < 2; i++){
- channels *= kernel_size[i];
+ channels *= static_cast<int>(kernel_size[i]);
}
std::vector<int64_t> out_shape({channels, num_blocks});
- if(batched_input == true){
+ if(batched_input){
out_shape.insert(out_shape.begin(), input_shape[0]);
}
auto out = at::empty({out_shape}, self.options());
@@ -1856,13 +1960,13 @@
input_shape.insert(input_shape.begin(), 1);
}
- int channels = input_shape[1];
+ int channels = static_cast<int>(input_shape[1]);
for(int i = 0; i < 2; i++){
- channels = channels / kernel_size[i];
+ channels = channels / static_cast<int>(kernel_size[i]);
}
std::vector<int64_t> out_shape({channels, output_size.at(0).expect_int(), output_size.at(1).expect_int()});
- if(batched_input == true){
+ if(batched_input){
out_shape.insert(out_shape.begin(), input_shape[0]);
}
auto out = at::empty({out_shape}, self.options());
@@ -1907,7 +2011,12 @@
auto shape = input.size(1);
auto out0 = at::empty({shape}, input.options().dtype(at::kFloat));
auto out1 = at::empty({shape}, input.options().dtype(at::kFloat));
- interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast(running_mean), const_cast(running_var), momentum, eps, counts)
+ interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast(running_mean), const_cast(running_var), static_cast(momentum), static_cast(eps), counts)
+ custom_code_before_call_diopi: |
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
+ custom_code_before_return: |
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
- schema: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
custom_code_at_the_beginning: |
@@ -1936,7 +2045,7 @@
- schema: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor
custom_code_at_the_beginning: |
auto out = at::empty_like(input);
- interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, eps);
+ interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, static_cast<float>(eps));
- schema: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
interface: diopiSmoothL1Loss(ctx, out, self, target, static_cast(reduction), static_cast(beta));
@@ -2143,7 +2252,7 @@
auto selfVec = self.vec();
auto scalarsCpu = scalars.cpu();
for (size_t i = 0;i < self.size();i++) {
- dipu_addcmul_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[i].item());
+ dipu_addcmul_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[static_cast<int64_t>(i)].item());
}
return;
interface: diopiAddcmulInp(ctx, self, tensor1, tensor2, scalars)
@@ -2174,7 +2283,7 @@
auto selfVec = self.vec();
auto scalarsCpu = scalars.cpu();
for (size_t i = 0;i < self.size();i++) {
- dipu_addcdiv_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[i].item());
+ dipu_addcdiv_(selfVec[i], tensor1[i], tensor2[i], scalarsCpu[static_cast<int64_t>(i)].item());
}
return;
interface: diopiAddcdivInp(ctx, self, tensor1, tensor2, scalars)
@@ -2262,12 +2371,14 @@
custom_fallback: True
custom_code_at_the_beginning: |
std::vector<diopiTensorHandle_t> diopiTensorHandles(self.size(), nullptr);
+ // NOTE: const_cast here is safe according to pytorch's source code
+ // NOLINTBEGIN(cppcoreguidelines-pro-type-const-cast)
std::transform(self.begin(), self.end(), diopiTensorHandles.begin(), [](const at::Tensor& t){
return dipu::diopi_helper::toDiopiTensorHandle(const_cast<at::Tensor&>(t));
});
- interface: diopiAmpForeachNonFiniteCheckAndUnscaleInp(ctx, diopiTensorHandles.data(), self.size(), found_inf, inv_scale)
- autocompare: disable
+ // NOLINTEND(cppcoreguidelines-pro-type-const-cast)
+ interface: diopiAmpForeachNonFiniteCheckAndUnscaleInp(ctx, diopiTensorHandles.data(), static_cast<int64_t>(self.size()), found_inf, inv_scale)
- schema: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
custom_fallback: True
- interface: diopiAmpUpdateScaleInp(ctx, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval)
+ interface: diopiAmpUpdateScaleInp(ctx, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, static_cast<int32_t>(growth_interval))
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py b/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py
index 7eda79b15c..22076410d1 100644
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_wrapper_template.py
@@ -16,29 +16,37 @@
#include
#include "CustomFallbackFunctions.hpp"
#include "csrc_dipu/aten/ops/DIPUCopy.hpp"
+#include
$header_include_code
-namespace dipu::native {
-
-using dipu::diopi_helper::toDiopiGeneratorHandle;
+// NOTE: some kernels (e.g. _foreach_add_.List) have custom code at the beginning that ends with an early return.
+// This is a workaround intended to skip some of the autogenerated code (e.g. type casts, the DIOPI call, etc.).
+// NOLINTBEGIN(readability-redundant-control-flow)
+namespace dipu {
-using namespace dipu::diopi_helper;
+namespace native {
+
+using dipu::diopi_helper::toDiopiGeneratorHandle;
+using dipu::diopi_helper::toDiopiSize;
+using dipu::diopi_helper::toDiopiRoundMode;
-$functions_code
+$functions_code
+} // namespace native
+} // namespace dipu
-} // namespace dipu::native
+// NOLINTEND(readability-redundant-control-flow)
namespace at {
DIPU_LIBRARY_IMPL(aten, DIPU_DEVICE_TYPE_MACRO, m) {
- $op_register_code
+ $op_register_code
}
DIPU_LIBRARY_IMPL(aten, DIPU_AUTOGRAD_DEVICE_TYPE_MACRO, m) {
- $autograd_op_register_code
+ $autograd_op_register_code
}
} // namespace at
@@ -49,34 +57,34 @@
"""
// $comment
$cppsignautre {
- dipu::profile::RecordBlockCreator _(__FUNCTION__);
- $custom_code_at_the_beginning
+ dipu::profile::RecordBlockCreator _(__FUNCTION__);
+ $custom_code_at_the_beginning
- ::diopiContext context(dipu::getCurrentDIPUStream().rawstream());
- auto ctx = &context;
+ ::diopiContext context(dipu::getCurrentDIPUStream().rawstream());
+ auto ctx = &context;
- $input_process_code
+ $input_process_code
- $output_process_code
+ $output_process_code
- $attrs_process_code
+ $attrs_process_code
- $device_check_code
+ $device_check_code
- $custom_code_before_call_diopi
+ $custom_code_before_call_diopi
- dipu::profile::RecordBlockCreator dipuRecorder(R"($diopi_fun_call_code)");
- ::diopiError_t ret = $diopi_fun_call_code
- dipuRecorder.end();
- if (checkDiopiReturnValue()) {
- TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, R"($diopi_fun_call_code)", " error, error code is ", ret, "error message is ", diopiGetLastErrorString());
- }
+ dipu::profile::RecordBlockCreator dipuRecorder(R"($diopi_fun_call_code)");
+ ::diopiError_t ret = $diopi_fun_call_code
+ dipuRecorder.end();
+ if (checkDiopiReturnValue()) {
+ TORCH_CHECK(ret == ::diopiSuccess, __FILE__, ":", __LINE__, R"($diopi_fun_call_code)", " error, error code is ", ret, "error message is ", diopiGetLastErrorString());
+ }
- $custom_code_before_return
+ $custom_code_before_return
- synchronizeIfEnable();
+ synchronizeIfEnable();
- $return_code
+ $return_code
}
"""
@@ -94,29 +102,29 @@
"""
class $autograd_function_name : public torch::autograd::Function<$autograd_function_name> {
public:
- static $return_code forward(torch::autograd::AutogradContext *ctx, $param_list) {
- $forward_process_code
+ static $return_code forward(torch::autograd::AutogradContext *ctx, $param_list) {
+ $forward_process_code
- $save_for_backward_code
+ $save_for_backward_code
- at::AutoDispatchBelowADInplaceOrView g;
- return $call_forward_impl_code;
- }
+ at::AutoDispatchBelowADInplaceOrView g;
+ return $call_forward_impl_code;
+ }
static std::vector<at::Tensor> backward(torch::autograd::AutogradContext *ctx, std::vector<at::Tensor> grad_outputs) {
- $load_saved_data_code
+ $load_saved_data_code
- $cal_grad_code
+ $cal_grad_code
- $call_backward_impl_code
+ $call_backward_impl_code
- $backward_return_code
+ $backward_return_code
}
};
$cppsignautre {
- auto result = $autograd_function_name::apply($arg_name_list);
- $wrappter_custom_return
+ auto result = $autograd_function_name::apply($arg_name_list);
+ $wrappter_custom_return
}
"""
@@ -125,15 +133,15 @@ class $autograd_function_name : public torch::autograd::Function<$autograd_funct
"""
// $comment
$cppsignautre {
- std::cout << std::endl << __FUNCTION__ << std::endl;
- $transform_input_to_cpu_code
+ std::cout << std::endl << __FUNCTION__ << std::endl;
+ $transform_input_to_cpu_code
- $execute_op_on_cpu_code
+ $execute_op_on_cpu_code
- $execute_op_on_device_code
+ $execute_op_on_device_code
- $transform_result_to_cpu_code
+ $transform_result_to_cpu_code
- $result_compare_code
+ $result_compare_code
}
"""
diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp
index 47f519984e..c7298900ae 100644
--- a/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp
+++ b/dipu/torch_dipu/csrc_dipu/aten/ops/DIPUCopy.hpp
@@ -15,12 +15,12 @@ namespace dipu {
namespace native {
// NOTICE: these 2 func defined in AutoGenedKernels.cpp
// if dipu autogen support header file gen, remove this
-at::Tensor dipu_wrap_diopi_cast_dtype(const at::Tensor& src,
+at::Tensor dipu_wrap_diopi_cast_dtype(const at::Tensor& self,
at::ScalarType dtype);
// if dipu autogen support proxy one torch op to multiple diopi op, remove
// this.
-at::Tensor& dipu_wrap_diopi_copy_inp(at::Tensor& dst, const at::Tensor& src,
+at::Tensor& dipu_wrap_diopi_copy_inp(at::Tensor& self, const at::Tensor& src,
bool non_blocking);
} // namespace native
From 16028db2b438b819ac5618bbe90b012a9c81197c Mon Sep 17 00:00:00 2001
From: Aaron
Date: Mon, 11 Dec 2023 11:35:18 +0800
Subject: [PATCH 06/58] [FIX] fix virtual memory error of using SUPA (#468)
* [FIX] fix virtual memory of SUPA
* [FIX] fix incorrect copy
* [FIX] remove useless copy and add missing 'supa' in cmakelists.txt
---
dipu/CMakeLists.txt | 6 ++--
.../csrc_dipu/vendor/supa/copyinplace.cpp | 17 +++++++----
.../csrc_dipu/vendor/supa/deviceimpl.cpp | 29 ++++++++++++++-----
3 files changed, 37 insertions(+), 15 deletions(-)
diff --git a/dipu/CMakeLists.txt b/dipu/CMakeLists.txt
index d94770c289..24b368a9de 100644
--- a/dipu/CMakeLists.txt
+++ b/dipu/CMakeLists.txt
@@ -44,7 +44,7 @@ elseif (${DEVICE} IN_LIST DEVICE_TOPSRIDER)
elseif (${DEVICE} IN_LIST DEVICE_SUPA)
set(USE_SUPA ON)
set(UsedVendor supa)
- set(DIOPI_IMPL_OPT "")
+ set(DIOPI_IMPL_OPT "supa")
#SUPA DEVICE DOES NOT NEED TO BUILD DIOPI, so set the target to "" to control the workflow.
elseif (${DEVICE} IN_LIST DEVICE_DROPLET)
set(USE_DROPLET ON)
@@ -81,14 +81,14 @@ if(NOT DEFINED DIPU_ABI_V)
OUTPUT_VARIABLE DIPU_ABI_V)
endif()
-if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
+if(NOT DEFINED DIPU_COMPILED_WITH_CXX11_ABI)
execute_process(
COMMAND
sh -x -c
"python -c 'import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)'"
OUTPUT_VARIABLE DIPU_COMPILED_WITH_CXX11_ABI)
endif()
-
+
if(DIPU_COMPILED_WITH_CXX11_ABI GREATER 0)
set(DIPU_COMPILED_WITH_CXX11_ABI 1)
else()
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
index 0b84a9e8ab..9149e8e985 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/copyinplace.cpp
@@ -18,11 +18,18 @@ class SUPACopyInplace : public DIPUCopyInpOnDIOPI {
SUPACopyInplace() = default;
~SUPACopyInplace() = default;
- // assume it can handle between device.
- void copyNodirectBetweenDevices(at::Tensor& dst, const at::Tensor& src,
- bool non_blocking,
- CopyParamsInfo& info) override {
- dipu_wrap_diopi_copy_inp(dst, src, non_blocking);
+ void run(at::Tensor& dst, const at::Tensor& src, bool non_blocking) override {
+ auto curStream = dipu::getCurrentDIPUStream();
+ ::diopiContext context(curStream.rawstream());
+ auto ctx = &context;
+ auto diopi_src = dipu::diopi_helper::toDiopiTensorHandle(src);
+ auto diopi_dst = dipu::diopi_helper::toDiopiTensorHandle(dst);
+ TORCH_CHECK(diopiError_t::diopiSuccess ==
+ diopiCopyInp(ctx, diopi_src, diopi_dst));
+ // syncAfterCopy
+ if (!non_blocking) {
+ dipu::devapis::syncStream(curStream.rawstream());
+ }
}
};
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
index c04b74e79f..f2f2983869 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/supa/deviceimpl.cpp
@@ -184,6 +184,8 @@ DIPU_API void freeHost(void* p) { free(p); }
extern "C" {
void* br_device_malloc(uint64_t bytes);
void br_device_free(void* ptr);
+// get the physical address for a (virtual) device pointer
+void* get_phy_ptr(const void* ptr);
}
DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion) {
@@ -206,47 +208,60 @@ DIPU_API bool isPinnedPtr(const void* p) { return false; }
// (asynchronous) set val
DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val,
size_t size) {
- SUPA_CALL(suMemsetAsync(ptr, val, size, stream));
+ auto phy_gpu_addr = get_phy_ptr(ptr);
+ SUPA_CALL(suMemsetAsync(phy_gpu_addr, val, size, stream));
}
// (synchronous) copy from device to a device
DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst,
deviceId_t srcDevId, const void* src) {
// SUPA uses Unified Virtual Address
- SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToDevice));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpy(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes,
+ suMemcpyDeviceToDevice));
}
// (synchronous) copy from host to a device
DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst,
/*Host srcDev,*/ const void* src) {
- SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyHostToDevice));
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpy(phy_dst_gpu_addr, src, nbytes, suMemcpyHostToDevice));
}
// (synchronous) copy from a device to host
DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst,
/*deviceId_t srcDevId,*/ const void* src) {
- SUPA_CALL(suMemcpy(dst, src, nbytes, suMemcpyDeviceToHost));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ SUPA_CALL(suMemcpy(dst, phy_src_gpu_addr, nbytes, suMemcpyDeviceToHost));
}
// (asynchronous) copy from device to a device
DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes,
deviceId_t dstDevId, void* dst,
deviceId_t srcDevId, const void* src) {
- SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToDevice));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, phy_src_gpu_addr, nbytes, stream,
+ suMemcpyDeviceToDevice));
}
// (asynchronous) copy from host to a device
DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes,
/*deviceId_t dstDevId,*/ void* dst,
/*Host srcDev,*/ const void* src) {
- SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyHostToDevice));
+ auto phy_dst_gpu_addr = get_phy_ptr(dst);
+ SUPA_CALL(suMemcpyAsync(phy_dst_gpu_addr, src, nbytes, stream,
+ suMemcpyHostToDevice));
}
// (asynchronous) copy from a device to host
DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes,
/*Host dstDev,*/ void* dst,
/*deviceId_t srcDevId,*/ const void* src) {
- SUPA_CALL(suMemcpyAsync(dst, src, nbytes, stream, suMemcpyDeviceToHost));
+ auto phy_src_gpu_addr = get_phy_ptr(src);
+ SUPA_CALL(suMemcpyAsync(dst, phy_src_gpu_addr, nbytes, stream,
+ suMemcpyDeviceToHost));
}
} // end namespace devapis
} // end namespace dipu
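The device-side changes in this patch all follow the same pattern: SUPA's allocator returns virtual pointers, so each memset/memcpy first translates them through get_phy_ptr before calling the runtime. A minimal sketch of that pattern, assuming the vendor's suMemcpy API and SUPA_CALL macro from the headers used above (copy_d2d is an illustrative wrapper, not part of the patch):

    // Sketch: pointers from the SUPA allocator are virtual, so they are
    // translated with get_phy_ptr before being handed to the runtime.
    // suMemcpy, suMemcpyDeviceToDevice and SUPA_CALL are assumed to come
    // from the vendor headers already included in this file.
    #include <cstddef>

    extern "C" void* get_phy_ptr(const void* ptr);  // virtual -> physical

    void copy_d2d(void* dst, const void* src, size_t nbytes) {
      void* phy_dst = get_phy_ptr(dst);
      void* phy_src = get_phy_ptr(src);
      SUPA_CALL(suMemcpy(phy_dst, phy_src, nbytes, suMemcpyDeviceToDevice));
    }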
From b178d4c5f5bdb49ae224c237f836d211670e2836 Mon Sep 17 00:00:00 2001
From: wyz5864 <109072365+wyz5864@users.noreply.github.com>
Date: Mon, 11 Dec 2023 11:36:25 +0800
Subject: [PATCH 07/58] make conv2d out at right memory-format (#502)
---
.../autogen_diopi_wrapper/diopi_functions.yaml | 6 +++---
dipu/tests/python/unittests/test_conv2d.py | 17 +++++++++++++++++
2 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
index 46bebbd3f2..af1e62e564 100755
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -515,7 +515,7 @@
int64_t out_height = (height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1;
int64_t out_width = (width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1;
c10::SmallVector output_size = {batch_size, out_channel, out_height, out_width};
- at::Tensor out = at::empty(output_size, input.options());
+ at::Tensor out = at::empty(output_size, input.options().memory_format(input.suggest_memory_format()));
interface: diopiConvolution2d(&context, out, input, weight, bias, stride, padding, dilation, groups)
- schema: "convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
@@ -527,10 +527,10 @@
at::Tensor grad_bias;
std::vector<int64_t> bias_sizes;
if (output_mask[0]) {
- grad_input = at::empty(input.sizes(), input.options());
+ grad_input = at::empty_like(input);
}
if (output_mask[1]) {
- grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat));
+ grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat).memory_format(weight.suggest_memory_format()));
}
if (output_mask[2]) {
bias_sizes.push_back(grad_output.size(1));
diff --git a/dipu/tests/python/unittests/test_conv2d.py b/dipu/tests/python/unittests/test_conv2d.py
index e93181c670..b33677aef3 100644
--- a/dipu/tests/python/unittests/test_conv2d.py
+++ b/dipu/tests/python/unittests/test_conv2d.py
@@ -39,6 +39,23 @@ def test_conv_2d(self):
)
# print("conv2d output compare successfully")
+ def test_conv2d_nhwc(self):
+ device = torch.device("dipu")
+
+ m = nn.Conv2d(2, 3, 3).to(device=device, memory_format=torch.channels_last)
+ self.assertTrue(m.weight.is_contiguous(memory_format=torch.channels_last))
+
+ x = torch.rand(2, 2, 5, 5).to(device=device, memory_format=torch.channels_last)
+ x.requires_grad_()
+ self.assertTrue(x.is_contiguous(memory_format=torch.channels_last))
+
+ y = m(x)
+ self.assertTrue(y.is_contiguous(memory_format=torch.channels_last))
+
+ y.backward(torch.rand_like(y))
+ self.assertTrue(x.grad.is_contiguous(memory_format=torch.channels_last))
+ self.assertTrue(m.weight.grad.is_contiguous(memory_format=torch.channels_last))
+
if __name__ == "__main__":
run_tests()
From ad46e399c64976f525211a4ea2fad0d25ffa9ca3 Mon Sep 17 00:00:00 2001
From: tangzhiyi11
Date: Mon, 11 Dec 2023 11:49:30 +0800
Subject: [PATCH 08/58] [dicp][ascend] add fusion switch file for ascend (#512)
---
dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg | 10 ++++++++++
dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp | 8 +++++---
dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h | 8 +++++++-
dicp/dicp/vendor/AscendGraph/compile_job.py | 3 ++-
4 files changed, 24 insertions(+), 5 deletions(-)
create mode 100644 dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg b/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg
new file mode 100644
index 0000000000..71834659c8
--- /dev/null
+++ b/dicp/dicp/vendor/AscendGraph/codegen/fusion_switch.cfg
@@ -0,0 +1,10 @@
+{
+ "Switch":{
+ "GraphFusion":{
+ "ALL":"on"
+ },
+ "UBFusion":{
+ "ALL":"on"
+ }
+ }
+}
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp b/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp
index fbced63f60..99f422dcaa 100644
--- a/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp
+++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_compile.cpp
@@ -1,7 +1,8 @@
#include "graph_utils.h"
static void compile(const std::string& graph_path,
- const std::string& graph_json_file) {
+ const std::string& graph_json_file,
+ const std::string& fusion_switch_file) {
std::string graph_name = "BuildGraph";
Graph graph(graph_name.c_str());
std::ifstream f(graph_json_file);
@@ -18,13 +19,14 @@ static void compile(const std::string& graph_path,
}
}
- AclgraphBuilder builder;
+ AclgraphBuilder builder{fusion_switch_file};
builder.saveGraph(graph_path, graph, options);
}
int main(int argc, char* argv[]) {
std::string graph_path{argv[1]};
std::string graph_json_file{argv[2]};
- compile(graph_path, graph_json_file);
+ std::string fusion_switch_file{argv[3]};
+ compile(graph_path, graph_json_file, fusion_switch_file);
return 0;
}
diff --git a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
index 69e06fec8a..2cbacf3bcb 100644
--- a/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
+++ b/dicp/dicp/vendor/AscendGraph/codegen/graph_utils.h
@@ -82,11 +82,14 @@ ge::Operator genInput(const std::string op_name,
class AclgraphBuilder {
public:
- explicit AclgraphBuilder() {
+ explicit AclgraphBuilder(const std::string& fusion_switch_file)
+ : _fusion_switch_file(fusion_switch_file) {
// 1. system init
auto kSocVersion = aclrtGetSocName();
std::map<AscendString, AscendString> global_options = {
{AscendString(ge::ir_option::SOC_VERSION), AscendString(kSocVersion)},
+ {AscendString(ge::ir_option::FUSION_SWITCH_FILE),
+ AscendString(_fusion_switch_file.c_str())},
{AscendString(ge::ir_option::PRECISION_MODE), "allow_fp32_to_fp16"},
};
auto status = aclgrphBuildInitialize(global_options);
@@ -122,6 +125,9 @@ class AclgraphBuilder {
aclgrphBuildFinalize();
std::cout << "aclgrphBuildFinalize success!" << std::endl;
}
+
+ private:
+ std::string _fusion_switch_file;
};
ge::Format get_ascend_format(const std::string& format) {
diff --git a/dicp/dicp/vendor/AscendGraph/compile_job.py b/dicp/dicp/vendor/AscendGraph/compile_job.py
index 625dc3dfb3..93b70dca43 100644
--- a/dicp/dicp/vendor/AscendGraph/compile_job.py
+++ b/dicp/dicp/vendor/AscendGraph/compile_job.py
@@ -28,6 +28,7 @@ def __init__(self, source_code) -> None:
graph_util_path = load_and_run.__file__.replace('/load_and_run.py', '')
source_path = graph_util_path + '/graph_compile.cpp'
json_util_path = graph_util_path + '/nlohmann'
+ self.fusion_switch_file = graph_util_path + '/fusion_switch.cfg'
self._cmd = ['/usr/bin/c++',
'-D_GLIBCXX_USE_CXX11_ABI=0',
'-fPIC',
@@ -67,7 +68,7 @@ def get_key(self):
def build_graph(self, output_path, graph_path):
self._compile()
- cmd = [self._lib_path, output_path, graph_path]
+ cmd = [self._lib_path, output_path, graph_path, self.fusion_switch_file]
try:
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
From 0bbb2ee32caa4aaa06a7193aab22bbef8e3c904b Mon Sep 17 00:00:00 2001
From: Lingjie
Date: Wed, 13 Dec 2023 14:45:53 +0800
Subject: [PATCH 09/58] [dipu] Speedup profiler ctor when not enabled (#526)
* speedup profiler ctor
* clean & format include
---
.../csrc_dipu/profiler/profiler.cpp | 71 ++++++++-----------
dipu/torch_dipu/csrc_dipu/profiler/profiler.h | 71 ++++++++++++-------
2 files changed, 75 insertions(+), 67 deletions(-)
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp
index ea23bf43f0..4789b49848 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp
+++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.cpp
@@ -1,12 +1,17 @@
#include "profiler.h"
-#include
#include
-#include
+#include
+#include
#include
+#include
#include
+#include "csrc_dipu/profiler/CorrelationIDManager.h"
+
+#include "ThreadUtil.h"
+
namespace dipu {
namespace profile {
@@ -265,22 +270,20 @@ void abandonAllRecords() {
resetId();
}
-RecordCreator::RecordCreator(const string_t& name, size_t opId,
+RecordCreator::RecordCreator(string_t name, size_t opId,
uint64_t linkCorrelationId,
- const ExtraRecordInfo& extraInfo) {
+ ExtraRecordInfo extraInfo) {
if (isEnable()) {
- name_ = name;
+ name_ = std::move(name);
opId_ = opId;
begin_ = torch::profiler::impl::getTime();
end_ = false;
linkCorrelationId_ = linkCorrelationId;
- extraInfo_ = extraInfo;
+ extraInfo_ = std::move(extraInfo);
}
}
-RecordCreator::~RecordCreator() { end(); }
-
-void RecordCreator::end() {
+void RecordCreator::end() noexcept {
if (!end_) {
RecordsImpl::get().addRecord(
Record{name_, opId_, begin_,
@@ -295,12 +298,12 @@ void RecordCreator::end() {
DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream,
int streamId, size_t opId,
uint64_t linkCorrelationId,
- const ExtraRecordInfo& extraInfo) {
+ ExtraRecordInfo extraInfo) {
if (isEnable()) {
DeviceRecordsImpl::get().ensureSetup(stream);
- name_ = name;
+ name_ = std::move(name);
opId_ = opId;
- extraInfo_ = extraInfo;
+ extraInfo_ = std::move(extraInfo);
stream_ = stream;
streamId_ = streamId;
pStart_.reset(new DeviceEvent());
@@ -311,9 +314,7 @@ DeviceRecordCreator::DeviceRecordCreator(string_t name, deviceStream_t stream,
}
}
-DeviceRecordCreator::~DeviceRecordCreator() { end(); }
-
-void DeviceRecordCreator::end() {
+void DeviceRecordCreator::end() noexcept {
if (!end_) {
TORCH_CHECK(pStart_, "dipu profiler error with pStart_ is not inited");
TORCH_CHECK(pStop_, "dipu profiler error with pStop_ is not inited");
@@ -329,12 +330,12 @@ void DeviceRecordCreator::end() {
}
static std::string extraceFunction(const std::string& functionName) {
- auto start = functionName.find_first_not_of(":");
+ auto start = functionName.find_first_not_of(':');
if (start == std::string::npos) {
return "";
}
- auto end = functionName.find_first_of("(");
+ auto end = functionName.find_first_of('(');
if (end == std::string::npos) {
end = functionName.size();
}
@@ -345,32 +346,18 @@ static std::string extraceFunction(const std::string& functionName) {
return functionName.substr(start, end - start);
}
-RecordBlockCreator::RecordBlockCreator(string_t name,
- const ExtraRecordInfo& extraInfo,
- deviceStream_t stream, int streamId,
- bool enProfile) {
- if (enProfile && isEnable()) {
- size_t opId = generateId();
- uint64_t correlationId =
- CorrelationIDManager::instance().getCorrelationID();
- name = extraceFunction(name);
- pHostRecord_.reset(new RecordCreator("LaunchKernel_" + name, opId,
- correlationId, extraInfo));
- pDeviceRecord_.reset(new DeviceRecordCreator(name, stream, streamId, opId,
- correlationId, extraInfo));
- }
-}
-
-void RecordBlockCreator::end() {
- if (!finish_) {
- pHostRecord_.reset();
- pDeviceRecord_.reset();
- }
- finish_ = true;
+void RecordBlockCreator::initialize(string_t name, ExtraRecordInfo extraInfo,
+ deviceStream_t stream,
+ c10::StreamId streamId) {
+ size_t opId = generateId();
+ uint64_t correlationId = CorrelationIDManager::instance().getCorrelationID();
+ name = extraceFunction(name);
+ pHostRecord_ = std::make_unique<RecordCreator>("LaunchKernel_" + name, opId,
+ correlationId, extraInfo);
+ pDeviceRecord_ = std::make_unique<DeviceRecordCreator>(
+ std::move(name), stream, streamId, opId, correlationId,
+ std::move(extraInfo));
}
-
-RecordBlockCreator::~RecordBlockCreator() { end(); }
-
} // namespace profile
} // namespace dipu
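The speedup in this commit comes from making the record creators effectively no-ops when profiling is disabled: the constructor only checks the enable flag, and all member setup moves into an initialize() path that runs solely when profiling is on. A minimal sketch of that lazy-initialization shape, assuming a cheap global enable check (the class and function names are illustrative, not the actual DIPU ones):

    // Sketch: pay for record setup only when the profiler is enabled, so the
    // per-op constructor cost is a single branch in the common (disabled) case.
    #include <memory>
    #include <string>

    bool profilingEnabled();  // assumed cheap global switch

    class ScopedRecord {
     public:
      explicit ScopedRecord(std::string name) {
        if (profilingEnabled()) {        // only do the expensive setup when enabled
          initialize(std::move(name));
        }
      }
      ~ScopedRecord() { record_.reset(); }  // nothing to flush if profiling was off

     private:
      struct Record { std::string name; };
      void initialize(std::string name) {
        record_ = std::make_unique<Record>();
        record_->name = std::move(name);
      }
      std::unique_ptr<Record> record_;
    };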
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h
index eed733567c..7cb5a750d5 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/profiler.h
+++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler.h
@@ -1,23 +1,23 @@
#pragma once
-#include
-#include
-#include
+#include
#include
#include