diff --git a/dicp/dicp/vendor/AscendGraph/conversion.py b/dicp/dicp/vendor/AscendGraph/conversion.py index c662c33ea..c1b11e21a 100644 --- a/dicp/dicp/vendor/AscendGraph/conversion.py +++ b/dicp/dicp/vendor/AscendGraph/conversion.py @@ -500,17 +500,17 @@ def ne(self, a, b): @register_conversion([aten.lt.Scalar, aten.lt.Tensor]) def lt(self, x, y): - y_shape = [1] - if isinstance(y, torch.fx.proxy.Proxy): - y_shape = list(y.node.meta['val'].shape) x_shape = list(x.node.meta['val'].shape) + y_shape = [] if not isinstance( + y, torch.fx.proxy.Proxy) else list(y.node.meta['val'].shape) out = list(fx_traceback.get_current_meta()['val'].shape) out_shape = self.get_shape_proxy(out) x, y = self.binary_cmp_cast_input(x, y) - - if self.shape_prod(x_shape) < self.shape_prod(out): + dynamic_shape = symint_in_shape(x_shape) or symint_in_shape( + y_shape) or symint_in_shape(out) + if dynamic_shape and (self.shape_prod(x_shape) < self.shape_prod(out)): x = self.get_proxy(ascend_op.BroadcastTo, (x, out_shape)) - if self.shape_prod(y_shape) < self.shape_prod(out): + if dynamic_shape and (self.shape_prod(y_shape) < self.shape_prod(out)): y = self.get_proxy(ascend_op.BroadcastTo, (y, out_shape)) return self.get_proxy(ascend_op.Less, (x, y)) @@ -834,6 +834,26 @@ def compute_stacked_indices(self, indices, src_shape): @register_conversion(torch.ops.aten.index_put.default) def index_put_default(self, x, indices, values): + x_shape = list(x.node.meta['val'].shape) + + # When the element type of indices is bool, the masked_fill operator + # should be used to achieve this. Currently, only indices with a length + # of 1 are supported. + if any([index.node.meta['val'].dtype in [torch.bool] + for index in indices if index is not None]): + assert len(indices) == 1 + index = indices[0] + index_shape = list(index.node.meta['val'].shape) + index_shape_size = len(index_shape) + x_shape_size = len(x_shape) + if index_shape_size == x_shape_size: + return self.masked_fill(x, index, values) + reshape_shape = index_shape + [1] * \ + (x_shape_size - index_shape_size) + reshape_op = self.get_const_proxy(reshape_shape, torch.int32) + index = self.get_proxy(ascend_op.Reshape, (index, reshape_op)) + return self.masked_fill(x, index, values) + # following comment is from tensorflow tensor_scatter_nd_update: # index_depth = indices.shape[-1] # batch_shape = indices.shape[:-1] @@ -845,7 +865,6 @@ def index_put_default(self, x, indices, values): # tf.tensor_scatter_nd_update param 'indices' is different from # indices in torch.ops.aten.index_put.default, we use broadcast and # stack to construct param 'indices' in tf.tensor_scatter_nd_update - x_shape = list(x.node.meta['val'].shape) stacked_indices, indices_broadcast_shape, stacked_indices_last_dim = \ self.compute_stacked_indices(indices, x.node.meta['val'].shape) values_broadcast_shape = indices_broadcast_shape + x_shape[stacked_indices_last_dim:] # batch_shape + inner_shape diff --git a/dicp/test/model/test_hf.py b/dicp/test/model/test_hf.py index 7b978b54d..fe439fe06 100644 --- a/dicp/test/model/test_hf.py +++ b/dicp/test/model/test_hf.py @@ -46,4 +46,4 @@ response_list.append(response.split('\n')) for idx, dicp_result in enumerate(response_list): - assert dicp_result == cuda_results[idx] + assert dicp_result == cuda_results[idx], f"dicp result:{dicp_result}, cuda_result:{cuda_results[idx]}" diff --git a/dicp/test/model/test_llama.py b/dicp/test/model/test_llama.py index 0353b3aee..433538bfb 100644 --- a/dicp/test/model/test_llama.py +++ b/dicp/test/model/test_llama.py @@ -101,4 +101,4 @@ def test_inference( prompt, max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, device=device ) dicp_result = dicp_result[0].split("\n") - assert dicp_result == cuda_results[i] + assert dicp_result == cuda_results[i], f"dicp result:{dicp_result}, cuda_result:{cuda_results[i]}" diff --git a/dicp/test/model/test_stable_diffusion.py b/dicp/test/model/test_stable_diffusion.py index 4f8ae9ac8..a94a1258a 100644 --- a/dicp/test/model/test_stable_diffusion.py +++ b/dicp/test/model/test_stable_diffusion.py @@ -50,8 +50,10 @@ def test_inference( dicp_pipe = StableDiffusionPipeline.from_pretrained(model_path).to(device) dicp_pipe.text_encoder = torch.compile(dicp_pipe.text_encoder, backend=backend, dynamic=dynamic) dicp_pipe.unet = torch.compile(dicp_pipe.unet, backend=backend, dynamic=dynamic) - if backend == "ascendgraph": - dicp_pipe.vae.decoder = torch.compile(dicp_pipe.vae.decoder, backend=backend, dynamic=dynamic) + + # Temporarily run decoder on CPU + # if backend == "ascendgraph": + # dicp_pipe.vae.decoder = torch.compile(dicp_pipe.vae.decoder, backend=backend, dynamic=dynamic) dicp_image = dicp_pipe(prompt, num_inference_steps=num_inference_steps).images[0] similarity = get_similarity(cpu_image, dicp_image) diff --git a/dicp/test/op/test_index_put.py b/dicp/test/op/test_index_put.py index 98beeb8c3..26ee1a17d 100644 --- a/dicp/test/op/test_index_put.py +++ b/dicp/test/op/test_index_put.py @@ -34,7 +34,7 @@ class TestIndexPut(): ((1, 2, 10, 8 ,7, 11), (None, None, (2, 3), (4, 1, 1), None, (1, 2, 1)), (4, 2, 3, 1, 2, 7)))]) @pytest.mark.parametrize("compiled_model", compiled_model) - def test_torch_split(self, sizes, dtype, compiled_model): + def test_torch_index_put(self, sizes, dtype, compiled_model): device = get_device() size = sizes.dynamic if compiled_model.dynamic else sizes.static x_size = size[0] @@ -59,3 +59,27 @@ def test_torch_split(self, sizes, dtype, compiled_model): dicp_output = compiled_model.model(dicp_input1, dicp_indices, dicp_value) assert torch.allclose(output.cpu(), dicp_output.cpu(), equal_nan=True) + + @pytest.mark.parametrize("dtype", [torch.float16]) + @pytest.mark.parametrize("sizes", [Size((5,), (5, 3)), Size((3, 5), (5, 3)), Size((2, 3, 4), (2, 4))]) + @pytest.mark.parametrize("compiled_model", compiled_model) + def test_torch_index_put_to_masked_fill(self, sizes, dtype, compiled_model): + device = get_device() + size = sizes.dynamic if compiled_model.dynamic else sizes.static + mask_size = size if len(size) == 1 else size[0] + + input = torch.randn(size, dtype=dtype) + mask = torch.randn(mask_size, dtype=dtype) > 0 + value = torch.tensor(1).to(dtype) + indices = [mask] + + dicp_input = input.to(device) + dicp_indices = [mask.to(device)] + dicp_value = value.to(device) + + output = model(input, indices, value) + dynamo.reset() + update_dynamo_config(compiled_model.dynamic) + dicp_output = compiled_model.model(dicp_input, dicp_indices, dicp_value) + + assert torch.allclose(output.cpu(), dicp_output.cpu(), equal_nan=True) diff --git a/dipu/.clang-tidy b/dipu/.clang-tidy index 8c9ddabec..14fefff48 100644 --- a/dipu/.clang-tidy +++ b/dipu/.clang-tidy @@ -66,6 +66,8 @@ CheckOptions: value: true - key: readability-implicit-bool-conversion.AllowPointerConditions value: true + - key: readability-simplify-boolean-expr.SimplifyDeMorgan + value: false # --- Google's naming convention BEGIN --- # modified part is marked as comment - key: readability-identifier-naming.ClassCase diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml index 47efb6a19..5ff126b9a 100755 --- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml +++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml @@ -60,7 +60,7 @@ return dipu_sub_scalar_out(self, other.item(), alpha, out); } if (self.numel() == 1 && self.is_cpu()) { - at::Tensor selfTensor = at::empty_like(other); + at::Tensor selfTensor = nodispatch::empty_like(other); dipu_fill__scalar(selfTensor, self.item()); return dipu_sub_out(selfTensor, other, alpha, out); } @@ -78,7 +78,7 @@ - schema: "div.Scalar(Tensor self, Scalar other) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiDivScalar(ctx, out, self, other, RoundModeNone) - schema: "div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)" @@ -101,7 +101,7 @@ } if (self.numel() == 1 && self.is_cpu()) { // todo:(wentao) temp solution, need using a type promotion strategy - at::Tensor selfD = at::empty_like(other); + at::Tensor selfD = nodispatch::empty_like(other); dipu_fill__scalar(selfD, self.item()); return dipu_div_out(selfD, other, out); } @@ -124,7 +124,7 @@ interface: diopiDiv(ctx, out, self, other, mode) - schema: "mul.Scalar(Tensor self, Scalar other) -> Tensor" - custom_code_at_the_beginning: auto out = at::empty_like(self); + custom_code_at_the_beginning: auto out = nodispatch::empty_like(self); interface: diopiMulScalar(ctx, out, self, other) - schema: "mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)" @@ -164,7 +164,7 @@ - schema: "logical_and(Tensor self, Tensor other) -> Tensor" custom_code_at_the_beginning: | auto shape = at::infer_size(self.sizes(), other.sizes()); - auto out = at::empty(shape, self.options().dtype(at::kBool)); + auto out = nodispatch::empty(shape, self.options().dtype(at::kBool)); interface: diopiLogicalAnd(ctx, out, self, other); - schema: "logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)" @@ -181,7 +181,7 @@ - schema: "logical_or(Tensor self, Tensor other) -> Tensor" custom_code_at_the_beginning: | auto shape = at::infer_size(self.sizes(), other.sizes()); - auto out = at::empty(shape, self.options().dtype(at::kBool)); + auto out = nodispatch::empty(shape, self.options().dtype(at::kBool)); interface: diopiLogicalOr(ctx, out, self, other); - schema: "logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -196,7 +196,7 @@ - schema: "logical_not(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty(self.sizes(), self.options().dtype(at::kBool)); + auto out = nodispatch::empty(self.sizes(), self.options().dtype(at::kBool)); interface: diopiLogicalNot(ctx, out, self); - schema: "aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))" @@ -212,7 +212,7 @@ const int64_t dim_c = input.size(1); const auto input_shape = input.sizes(); const int axis = input_shape.size(); - auto out0 = at::empty_like(input, input.options(), \ + auto out0 = nodispatch::empty_like(input, input.options(), \ (axis==4?\ (c10::optional(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt})):\ (axis==5?\ @@ -224,11 +224,11 @@ at::Tensor out2; if (!training) { // do not require save_mean/save_invstd when in test mode - out1 = at::empty({0}, options); - out2 = at::empty({0}, options); + out1 = nodispatch::empty({0}, options); + out2 = nodispatch::empty({0}, options); } else { - out1 = at::empty({dim_c}, options); - out2 = at::empty({dim_c}, options); + out1 = nodispatch::empty({dim_c}, options); + out2 = nodispatch::empty({dim_c}, options); } interface: diopiBatchNorm(ctx, out0, out1, out2, input, weight, bias, const_cast(running_mean), const_cast(running_var), training, momentum, eps); custom_code_before_call_diopi: | @@ -243,30 +243,30 @@ auto options = input.options().dtype(at::kFloat); const auto input_shape = input.sizes(); const int axis = input_shape.size(); - at::Tensor out0 = at::empty_like(input, input.options(), \ + at::Tensor out0 = nodispatch::empty_like(input, input.options(), \ (axis==4?\ (c10::optional(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt})):\ (axis==5?\ (c10::optional(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER_3D:-c10::nullopt})):\ c10::optional(c10::nullopt))\ )); - at::Tensor out1 = at::empty({dim_c}, options); - at::Tensor out2 = at::empty({dim_c}, options); + at::Tensor out1 = nodispatch::empty({dim_c}, options); + at::Tensor out2 = nodispatch::empty({dim_c}, options); interface: diopiBatchNormBackward(ctx, out0, out1, out2, grad_out, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps) - schema: "native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)" custom_code_at_the_beginning: | - auto out0 = at::empty_like(input); + auto out0 = nodispatch::empty_like(input); auto options = input.options().dtype(dipu::native::mixed_output_scalar_type(input, weight, bias)); - auto out1 = at::empty({N.expect_int(), group}, options); - auto out2 = at::empty({N.expect_int(), group}, options); + auto out1 = nodispatch::empty({N.expect_int(), group}, options); + auto out2 = nodispatch::empty({N.expect_int(), group}, options); interface: diopiGroupNorm(ctx, out0, out1, out2, input, weight, bias, group, eps); - schema: "native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)" custom_code_at_the_beginning: | - auto out0 = output_mask[0] ? at::empty_like(input) : at::Tensor(); - auto out1 = output_mask[1] ? at::empty_like(weight.value()) : at::Tensor(); - auto out2 = output_mask[2] ? at::empty_like(weight.value()) : at::Tensor(); + auto out0 = output_mask[0] ? nodispatch::empty_like(input) : at::Tensor(); + auto out1 = output_mask[1] ? nodispatch::empty_like(weight.value()) : at::Tensor(); + auto out2 = output_mask[2] ? nodispatch::empty_like(weight.value()) : at::Tensor(); interface: diopiGroupNormBackward(ctx, out0, out1, out2, grad_out, input, weight, mean, rstd, group); - schema: "native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor out, Tensor save_mean, Tensor save_invstd)" @@ -277,9 +277,9 @@ std::vector stats_shape(input_shape.size(), 1); std::copy(input_shape.begin(), input_shape.begin() + axis, stats_shape.begin()); auto options = input.options(); - auto save_mean = at::empty(stats_shape, options); - auto save_invstd = at::empty(stats_shape, options); - auto out = at::empty_like( + auto save_mean = nodispatch::empty(stats_shape, options); + auto save_invstd = nodispatch::empty(stats_shape, options); + auto out = nodispatch::empty_like( input, c10::nullopt /* dtype */, c10::nullopt /* layout */, @@ -293,9 +293,9 @@ - schema: "native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)" custom_code_at_the_beginning: | auto options = grad_out.options(); - auto grad_input = output_mask[0] ? at::empty(input.sizes(), options) : at::Tensor(); - auto grad_weight = output_mask[1] ? at::empty(weight.value().sizes(), options) : at::Tensor(); - auto grad_bias = output_mask[2] ? at::empty(bias.value().sizes(), options) : at::Tensor(); + auto grad_input = output_mask[0] ? nodispatch::empty(input.sizes(), options) : at::Tensor(); + auto grad_weight = output_mask[1] ? nodispatch::empty(weight.value().sizes(), options) : at::Tensor(); + auto grad_bias = output_mask[2] ? nodispatch::empty(bias.value().sizes(), options) : at::Tensor(); interface: diopiLayerNormBackward(ctx, grad_input, grad_weight, grad_bias, grad_out, input, weight, bias, mean, rstd, normalized_shape); - schema: "aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))" @@ -312,12 +312,12 @@ auto out_tensor_size = self.sizes().vec(); out_tensor_size[self.dim() - 2] = output_size[0].expect_int(); out_tensor_size[self.dim() - 1] = output_size[1].expect_int(); - at::Tensor out = at::empty(out_tensor_size, self.options()); + at::Tensor out = nodispatch::empty(out_tensor_size, self.options()); interface: diopiAdaptiveAvgPool2d(ctx, out, self, output_size) - schema: "_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiAdaptiveAvgPool2dBackward(ctx, out, grad_output, self); - schema: "avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)" @@ -470,7 +470,7 @@ interface: diopiReluInp(ctx, self) - schema: "relu(Tensor self) -> Tensor" - custom_code_at_the_beginning: auto out = at::empty_like(self); + custom_code_at_the_beginning: auto out = nodispatch::empty_like(self); interface: diopiRelu(ctx, out, self) - schema: "randperm.out(int n, *, Tensor(a!) out) -> Tensor(a!)" @@ -489,7 +489,7 @@ ::diopiConstTensorHandle_t self_dtype_diopi = dipu::diopi_helper::toDiopiTensorHandle(self_dtype); if (out.numel() == 0) { std::vector output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector()), keepdim); - out = at::empty(output_shape, self.options()); + out = nodispatch::empty(output_shape, self.options()); } ::diopiSize_t diopi_size = toDiopiSize(dim); interface: diopiSum(ctx, out, self_dtype_diopi, diopi_size) @@ -503,7 +503,7 @@ register_op: False custom_code_at_the_beginning: | const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction); - at::Tensor out = at::empty_like(self); + at::Tensor out = nodispatch::empty_like(self); interface: diopiCrossEntropyLossBackward(ctx, out, grad_output, self, target, weight, reductionDiopi, ignore_index.expect_int(), label_smoothing) - schema: "cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor" @@ -514,9 +514,9 @@ at::Tensor out; auto options = self.options(); if (reductionDiopi == ReductionNone) { - out = at::empty(target.sizes(), options); + out = nodispatch::empty(target.sizes(), options); } else { - out = at::empty({}, options); + out = nodispatch::empty({}, options); } interface: diopiCrossEntropyLoss(ctx, out, self, target, weight, reductionDiopi, ignore_index_int, label_smoothing) backward_schema: "cross_entropy_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor" @@ -546,7 +546,7 @@ int64_t out_height = (height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1; int64_t out_width = (width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1; c10::SmallVector output_size = {batch_size, out_channel, out_height, out_width}; - at::Tensor out = at::empty(output_size, input.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-input.suggest_memory_format()}); + at::Tensor out = nodispatch::empty(output_size, input.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-input.suggest_memory_format()}); interface: diopiConvolution2d(&context, out, input, weight, bias, stride, padding, dilation, groups) - schema: "convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)" @@ -558,14 +558,14 @@ at::Tensor grad_bias; std::vector bias_sizes; if (output_mask[0]) { - grad_input = at::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + grad_input = nodispatch::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); } if (output_mask[1]) { - grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat).memory_format(weight.suggest_memory_format())); + grad_weight = nodispatch::empty(weight.sizes(), weight.options().dtype(at::kFloat).memory_format(weight.suggest_memory_format())); } if (output_mask[2]) { bias_sizes.push_back(grad_output.size(1)); - grad_bias = at::empty(bias_sizes, grad_output.options()); + grad_bias = nodispatch::empty(bias_sizes, grad_output.options()); } custom_code_before_call_diopi: | ::diopiSize_t* bias_sizes_ptr = output_mask[2] ? &bias_sizesDiopiSize : nullptr; @@ -578,10 +578,10 @@ at::Tensor grad_input; at::Tensor grad_weight; at::Tensor grad_bias; - grad_input = at::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); - grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat)); + grad_input = nodispatch::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + grad_weight = nodispatch::empty(weight.sizes(), weight.options().dtype(at::kFloat)); if (output_mask[2]) { - grad_bias = at::empty({grad_output.size(1)}, grad_output.options()); + grad_bias = nodispatch::empty({grad_output.size(1)}, grad_output.options()); } custom_code_before_call_diopi: | ::diopiSize_t* bias_sizes_ptr = output_mask[2] ? &bias_sizesDiopiSize : nullptr; @@ -600,7 +600,7 @@ const int64_t w_out = (w_in - 1) * stride[1] - 2 * padding[1] + (dilation[1] * (kernel_width - 1) + 1) + output_padding[1]; const int64_t c_out = weight.size(1) * groups; auto output_shape = input.sizes().size() == 3 ? std::vector{c_out, h_out, w_out} : std::vector{n, c_out, h_out, w_out}; - auto out = at::empty(output_shape, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + auto out = nodispatch::empty(output_shape, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); interface: diopiConvTranspose2d(ctx, out, input, weight, bias, stride, padding, output_padding, groups, dilation) forward_process_code: | bool bias_has_value = (bias.has_value()) ? bias.value().requires_grad() : false; @@ -639,11 +639,11 @@ - schema: "native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)" custom_code_at_the_beginning: | - at::Tensor out0 = at::empty_like(input); + at::Tensor out0 = nodispatch::empty_like(input); at::Tensor out1; bool train_ = train.value_or(false); if (train_) { - out1 = at::empty(input.sizes(), input.options().dtype(at::kByte));; + out1 = nodispatch::empty(input.sizes(), input.options().dtype(at::kByte));; } diopiGeneratorHandle_t generatorDiopiGenerator = toDiopiGeneratorHandle(getDefaultDIPUGenerator()); interface: diopiDropout(ctx, out0, out1, input, p, train_, generatorDiopiGenerator) @@ -672,7 +672,7 @@ - schema: "abs(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiAbs(ctx, out, self) - schema: "abs_(Tensor(a!) self) -> Tensor(a!)" @@ -715,8 +715,8 @@ dim = dim + static_cast(output_size.size()); } output_size[dim] = k; - auto values = at::empty(output_size, self.options()); - auto indices = at::empty(output_size, self.options().dtype(at::kLong)); + auto values = nodispatch::empty(output_size, self.options()); + auto indices = nodispatch::empty(output_size, self.options().dtype(at::kLong)); interface: diopiTopk(ctx, values, indices, self, k, dim, largest, sorted) - schema: "mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)" @@ -730,7 +730,7 @@ torch_ver: ["200",] custom_code_at_the_beginning: | std::vector output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector()), keepdim); - auto out = at::empty(output_shape, self.options()); + auto out = nodispatch::empty(output_shape, self.options()); bool unbiased = correction.value_or(1) == 1; ::diopiSize_t diopi_size = toDiopiSize(dim); interface: diopiStd(ctx, out, self, diopi_size, unbiased); @@ -746,7 +746,7 @@ torch_ver: ["211",] custom_code_at_the_beginning: | std::vector output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector()), keepdim); - auto out = at::empty(output_shape, self.options()); + auto out = nodispatch::empty(output_shape, self.options()); bool unbiased = correction.value_or(1).toLong() == 1; ::diopiSize_t diopi_size = toDiopiSize(dim); interface: diopiStd(ctx, out, self, diopi_size, unbiased); @@ -766,13 +766,13 @@ at::Tensor grad_weight; at::Tensor grad_bias; if (output_mask[0]) { - grad_input = at::empty(input.sizes(), grad_output.options()); + grad_input = nodispatch::empty(input.sizes(), grad_output.options()); } if (output_mask[1]) { - grad_weight = at::empty(weight.sizes(), grad_output.options()); + grad_weight = nodispatch::empty(weight.sizes(), grad_output.options()); } if (output_mask[2]) { - grad_bias = at::empty({grad_output.size(-1)}, grad_output.options()); + grad_bias = nodispatch::empty({grad_output.size(-1)}, grad_output.options()); } interface: diopiLinearBackward(ctx, grad_input, grad_weight, grad_bias, grad_output, input, weight) @@ -782,7 +782,7 @@ custom_code_at_the_beginning: | std::vector output_size(input.sizes().begin(), input.sizes().end()); output_size.back() = weight.sizes()[0]; - auto out = at::empty(output_size, input.options()); + auto out = nodispatch::empty(output_size, input.options()); interface: diopiLinear(ctx, out, input, weight, bias) - schema: "_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!)" @@ -810,7 +810,7 @@ register_op: False size_attr: [kernel_size, stride, padding, dilation] custom_code_at_the_beginning: | - auto out = at::empty(input.sizes(), grad_output.options()); + auto out = nodispatch::empty(input.sizes(), grad_output.options()); interface: diopiMaxPool2dBackward(ctx, out, grad_output, input, kernel_size, stride, padding, dilation, ceil_mode, indices) - schema: "max_pool2d(Tensor input, int[2] kernel_size=1, int[2] stride=1, int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor" @@ -824,7 +824,7 @@ int64_t out_height = std::floor((height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1); int64_t out_width = std::floor((width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1); c10::SmallVector output_size = {batch_size, channel, out_height, out_width}; - at::Tensor out = at::empty(output_size, input.options()); + at::Tensor out = nodispatch::empty(output_size, input.options()); interface: diopiMaxPool2d(&context, out, input, kernel_size, stride, padding, dilation, ceil_mode) autograd: True saved_data: [kernel_size, stride, padding, dilation, input, ceil_mode] @@ -861,7 +861,7 @@ if (reduction != 0) { output = torch::tensor(0.0, self.options()); } else { - output = at::empty(target.sizes(), self.options()); + output = nodispatch::empty(target.sizes(), self.options()); } - schema: nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) @@ -873,7 +873,7 @@ - schema: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor grad_input interface: diopiNLLLossBackward(&context, grad_input, grad_output, self, target, weight, static_cast(reduction), ignore_index.expect_int()); custom_code_at_the_beginning: | - auto grad_input = at::empty(self.sizes(), self.options()); + auto grad_input = nodispatch::empty(self.sizes(), self.options()); - schema: "threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)" interface: diopiThresholdBackward(ctx, grad_input, grad_output, self, &threshold) @@ -937,7 +937,7 @@ } const std::vector& const_tmp = tmp; shape = at::ArrayRef(const_tmp); - auto out = at::empty({shape}, tensors[0].options()); + auto out = nodispatch::empty({shape}, tensors[0].options()); std::vector diopiTensorHandles(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { @@ -964,8 +964,8 @@ } else { dim_ = dim; } - auto values = at::empty(self.sizes(), self.options()); - auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong)); + auto values = nodispatch::empty(self.sizes(), self.options()); + auto indices = nodispatch::empty(self.sizes(), self.options().dtype(at::kLong)); interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr) - schema: "sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)" @@ -1000,7 +1000,7 @@ - schema: "tril(Tensor self, int diagonal=0) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiTril(ctx, out, self, diagonal) - schema: "tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)" @@ -1008,12 +1008,12 @@ - schema: "multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); if (self.dim() == 2){ - out = at::empty({self.size(0), num_samples}, self.options().dtype(at::kLong)); + out = nodispatch::empty({self.size(0), num_samples}, self.options().dtype(at::kLong)); } else if (self.dim() == 1) { - out = at::empty({num_samples,}, self.options().dtype(at::kLong)); + out = nodispatch::empty({num_samples,}, self.options().dtype(at::kLong)); } interface: diopiMultinomial(ctx, out, self, num_samples, replacement, generator) @@ -1022,14 +1022,14 @@ - schema: "roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); ::diopiSize_t diopi_shifts = toDiopiSize(shifts); ::diopiSize_t diopi_dims = toDiopiSize(dims); interface: diopiRoll(ctx, out, self, diopi_shifts, diopi_dims) - schema: "leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiLeakyRelu(ctx, out, self, negative_slope) - schema: "leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)" @@ -1047,9 +1047,9 @@ at::Tensor out; auto options = self.options(); if (reductionDiopi == ReductionNone) { - out = at::empty(self.sizes(), options); + out = nodispatch::empty(self.sizes(), options); } else { - out = at::empty({}, options); + out = nodispatch::empty({}, options); } interface: diopiMSELoss(ctx, out, self, target, reductionDiopi) @@ -1060,7 +1060,7 @@ - schema: "mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor grad_input" custom_code_at_the_beginning: | - auto grad_input = at::empty(self.sizes(), grad_output.options()); + auto grad_input = nodispatch::empty(self.sizes(), grad_output.options()); const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction); interface: diopiMSELossBackward(ctx, grad_input, grad_output, self, target, reductionDiopi) @@ -1122,7 +1122,7 @@ custom_code_at_the_beginning: | auto promoted_dtype = at::native::get_dtype_from_self(self, dtype, /*promote_integers=*/true); const auto self_dtype = at::native::to(self, promoted_dtype); - auto out = at::empty({}, self_dtype.options()); + auto out = nodispatch::empty({}, self_dtype.options()); ::diopiConstTensorHandle_t self_dtype_diopi = dipu::diopi_helper::toDiopiTensorHandle(self_dtype); interface: diopiProd(ctx, out, self_dtype_diopi, nullptr) @@ -1145,12 +1145,12 @@ output_size[j] *= self_sizes.at(i); } - at::Tensor out = at::empty(output_size, self.options()); + at::Tensor out = nodispatch::empty(output_size, self.options()); interface: diopiRepeat(ctx, out, self, repeats) - schema: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); // NOLINTNEXTLINE(readability-suspicious-call-argument) return dipu_sub_out(other, self, alpha, out); interface: diopiSub(ctx, out, other, self, alpha) @@ -1165,7 +1165,7 @@ if (dim < 0) { dim += static_cast(ndims); } - indices = at::empty({self.sizes().at(dim)}, self.options().dtype(at::kLong)); + indices = nodispatch::empty({self.sizes().at(dim)}, self.options().dtype(at::kLong)); } diopiTensorHandle_t out_ptr = nullptr; diopiTensorHandle_t counts_ptr = nullptr; @@ -1183,7 +1183,7 @@ at::Tensor counts; at::Tensor indices; if (return_inverse) { - indices = at::empty(self.sizes(), self.options().dtype(at::kLong)); + indices = nodispatch::empty(self.sizes(), self.options().dtype(at::kLong)); } diopiTensorHandle_t out_ptr = nullptr; diopiTensorHandle_t counts_ptr = nullptr; @@ -1205,7 +1205,7 @@ - schema: "masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiMaskedFill(ctx, out, self, mask, value) - schema: "masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)" @@ -1213,7 +1213,7 @@ - schema: "masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiMaskedFillScalar(ctx, out, self, mask, value) - schema: "masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)" @@ -1221,7 +1221,7 @@ - schema: "min(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty({}, self.options()); + auto out = nodispatch::empty({}, self.options()); interface: diopiMinAll(ctx, out, self) - schema: "min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) min, Tensor(b!) min_indices)" @@ -1231,7 +1231,7 @@ - schema: "max(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty({}, self.options()); + auto out = nodispatch::empty({}, self.options()); interface: diopiMaxAll(ctx, out, self) - schema: "maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)" @@ -1314,8 +1314,8 @@ custom_code_at_the_beginning: | // todo:(wentao) temp solution, need using a type promotion strategy at::Tensor out = self.numel() == 1 && self.is_cpu() ? - at::empty_like(other) : - at::empty_like(self); + nodispatch::empty_like(other) : + nodispatch::empty_like(self); out = dipu_div_out(self,other,out); interface: diopiFloorInp(ctx,out) @@ -1323,7 +1323,7 @@ custom_code_at_the_beginning: | auto shape = at::infer_size(condition.sizes(), self.sizes()); shape = at::infer_size(shape, other.sizes()); - auto out = at::empty(shape, self.options()); + auto out = nodispatch::empty(shape, self.options()); interface: diopiWhere(ctx, out, condition,self, other) - schema: "gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)" @@ -1333,7 +1333,7 @@ interface: diopiGeluBackward(ctx, grad_input, grad_output, self, approximate.data()) - schema: "hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor" - custom_code_at_the_beginning: auto out = at::empty_like(self); + custom_code_at_the_beginning: auto out = nodispatch::empty_like(self); custom_code_before_call_diopi: | min_valDiopiScalar = dipu::diopi_helper::toDiopiScalar(min_val, at::kDouble); max_valDiopiScalar = dipu::diopi_helper::toDiopiScalar(max_val, at::kDouble); @@ -1361,7 +1361,7 @@ - schema: "hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor grad_input" custom_code_at_the_beginning: | - auto grad_input = at::empty(self.sizes(), grad_output.options()); + auto grad_input = nodispatch::empty(self.sizes(), grad_output.options()); interface: diopiHardtanhBackward(ctx, grad_input, grad_output, self, min_val, max_val) - schema: "upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)" @@ -1390,7 +1390,7 @@ size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0)); size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0)); } - auto out = at::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + auto out = nodispatch::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); interface: diopiUpsampleNearest(ctx, out, self, size); - schema: "upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)" @@ -1420,7 +1420,7 @@ size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0)); size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0)); } - auto out = at::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + auto out = nodispatch::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); const char* mode = "bilinear"; interface: diopiUpsampleLinear(ctx, out, self, size, align_corners, mode); @@ -1444,7 +1444,7 @@ auto symInt2Int = [](const c10::SymInt& t)-> int64_t {return t.expect_int();}; std::vector grad_input_shape(input_size.size()); std::transform(input_size.cbegin(), input_size.cend(), grad_input_shape.begin(), symInt2Int); - auto grad_input = at::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + auto grad_input = nodispatch::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); custom_code_before_call_diopi: | if (output_size.size() > 0) { std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); @@ -1475,7 +1475,7 @@ auto symInt2Int = [](const c10::SymInt& t)-> int64_t {return t.expect_int();}; std::vector grad_input_shape(input_size.size()); std::transform(input_size.cbegin(), input_size.cend(), grad_input_shape.begin(), symInt2Int); - auto grad_input = at::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + auto grad_input = nodispatch::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); custom_code_before_call_diopi: | if (output_size.size() > 0) { std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin()); @@ -1488,7 +1488,7 @@ - schema: "sin(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiSin(ctx, out, self) - schema: "sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -1499,7 +1499,7 @@ - schema: "cos(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); interface: diopiCos(ctx, out, self) - schema: "cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -1530,7 +1530,7 @@ - schema: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor autocompare: disable custom_code_at_the_beginning: | - auto out = at::empty_like(mean); + auto out = nodispatch::empty_like(mean); interface: diopiNormalTensorScalar(ctx, out, mean, std, generator) - schema: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) @@ -1540,7 +1540,7 @@ - schema: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor autocompare: disable custom_code_at_the_beginning: | - auto out = at::empty_like(std); + auto out = nodispatch::empty_like(std); interface: diopiNormalScalarTensor(ctx, out, mean, std, generator) - schema: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) @@ -1550,7 +1550,7 @@ - schema: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor autocompare: disable custom_code_at_the_beginning: | - auto out = at::empty_like(mean); + auto out = nodispatch::empty_like(mean); interface: diopiNormalTensor(ctx, out, mean, std, generator) - schema: normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) @@ -1602,7 +1602,7 @@ } } - auto out = at::empty(output_shape, self.options()); + auto out = nodispatch::empty(output_shape, self.options()); interface: diopiMatmul(ctx, out, self, other) @@ -1631,7 +1631,7 @@ - schema: "flip(Tensor self, int[] dims) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty_like(self); + auto out = nodispatch::empty_like(self); ::diopiSize_t diopi_size = toDiopiSize(dims); interface: diopiFlip(ctx, out,self,diopi_size) @@ -1647,12 +1647,12 @@ std::vector output_shape(shape.begin(), shape.end()); dim += dim >= 0 ? 0 : static_cast(shape.size()); output_shape[dim] = index.numel(); - auto out = at::empty({output_shape}, self.options()); + auto out = nodispatch::empty({output_shape}, self.options()); interface: diopiIndexSelect(ctx, out, self, dim, index) - schema: "hardswish(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty(self.sizes(), self.options()); + auto out = nodispatch::empty(self.sizes(), self.options()); interface: diopiHardswish(ctx, out, self) - schema: "hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -1663,7 +1663,7 @@ - schema: "hardswish_backward(Tensor grad_output, Tensor self) -> Tensor grad_input" custom_code_at_the_beginning: | - auto grad_input = at::empty(self.sizes(), grad_output.options()); + auto grad_input = nodispatch::empty(self.sizes(), grad_output.options()); interface: diopiHardswishBackward(ctx, grad_input, grad_output, self) - schema: "sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -1681,9 +1681,9 @@ at::Tensor out; auto options = self.options(); if (reductionDiopi == ReductionNone) { - out = at::empty(self.sizes(), options); + out = nodispatch::empty(self.sizes(), options); } else { - out = at::empty({}, options); + out = nodispatch::empty({}, options); } interface: diopiBCELoss(ctx, out, self, target, weight, reductionDiopi) @@ -1696,7 +1696,7 @@ register_op: False custom_code_at_the_beginning: | const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction); - at::Tensor grad_input = at::empty_like(log_probs); + at::Tensor grad_input = nodispatch::empty_like(log_probs); interface: diopiCTCLossBackward(ctx, grad_input, grad_output, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, reductionDiopi, zero_infinity) @@ -1712,12 +1712,12 @@ int64_t max_target_length = target_lengths.max().cpu().item().to(); auto options = log_probs.options(); if (reductionDiopi == ReductionNone) { - out = at::empty({batch_size}, options); + out = nodispatch::empty({batch_size}, options); } else { - out = at::empty({1}, options); + out = nodispatch::empty({1}, options); } - at::Tensor neg_log_likelihood = at::empty({batch_size}, options); - at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); + at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options); + at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); interface: diopiCTCLoss(ctx, out, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths, target_lengths, blank, reductionDiopi, zero_infinity) forward_process_code: | auto targets_dev = targets.to(log_probs.device()); @@ -1751,8 +1751,8 @@ int64_t num_labels = log_probs.size(2); int64_t max_target_length = target_lengths.max().cpu().item().to(); - at::Tensor neg_log_likelihood = at::empty({batch_size}, options); - at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); + at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options); + at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); backward_return_code: | /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h * @@ -1794,15 +1794,15 @@ custom_code_at_the_beginning: | const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction); - auto input_lengths_tensor = at::empty({static_cast(input_lengths.size())}, at::kLong); - auto target_lengths_tensor = at::empty({static_cast(target_lengths.size())}, at::kLong); + auto input_lengths_tensor = nodispatch::empty_cpu({static_cast(input_lengths.size())}, at::kLong); + auto target_lengths_tensor = nodispatch::empty_cpu({static_cast(target_lengths.size())}, at::kLong); std::copy(input_lengths.begin(), input_lengths.end(), static_cast(input_lengths_tensor.data_ptr())); std::copy(target_lengths.begin(), target_lengths.end(), static_cast(target_lengths_tensor.data_ptr())); input_lengths_tensor = input_lengths_tensor.to(log_probs.device()); target_lengths_tensor = target_lengths_tensor.to(log_probs.device()); - at::Tensor grad_input = at::empty_like(log_probs); + at::Tensor grad_input = nodispatch::empty_like(log_probs); interface: diopiCTCLossBackward(ctx, grad_input, grad_output, log_probs, targets, input_lengths_tensor, target_lengths_tensor, neg_log_likelihood, log_alpha, blank, reductionDiopi, zero_infinity) @@ -1813,8 +1813,8 @@ outs: [neg_log_likelihood, log_alpha] custom_code_at_the_beginning: | const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction); - auto input_lengths_tensor = at::empty({static_cast(input_lengths.size())}, at::kLong); - auto target_lengths_tensor = at::empty({static_cast(target_lengths.size())}, at::kLong); + auto input_lengths_tensor = nodispatch::empty_cpu({static_cast(input_lengths.size())}, at::kLong); + auto target_lengths_tensor = nodispatch::empty_cpu({static_cast(target_lengths.size())}, at::kLong); std::copy(input_lengths.begin(), input_lengths.end(), static_cast(input_lengths_tensor.data_ptr())); std::copy(target_lengths.begin(), target_lengths.end(), static_cast(target_lengths_tensor.data_ptr())); @@ -1827,12 +1827,12 @@ int64_t max_target_length = target_lengths_tensor.max().cpu().item().to(); auto options = log_probs.options(); if (reductionDiopi == ReductionNone) { - out = at::empty({batch_size}, options); + out = nodispatch::empty({batch_size}, options); } else { - out = at::empty({1}, options); + out = nodispatch::empty({1}, options); } - at::Tensor neg_log_likelihood = at::empty({batch_size}, options); - at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); + at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options); + at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); interface: diopiCTCLoss(ctx, out, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths_tensor, target_lengths_tensor, blank, reductionDiopi, zero_infinity) backward_schema: "ctc_loss_intlist_backward(Tensor grad_output, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, int reduction=Mean, bool zero_infinity=False) -> Tensor grad_input" saved_data: @@ -1862,8 +1862,8 @@ // int64_t max_target_length = target_lengths_tensor.max().cpu().item().to(); int64_t max_target_length = *std::max_element(target_lengths.begin(), target_lengths.end()); - at::Tensor neg_log_likelihood = at::empty({batch_size}, options); - at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); + at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options); + at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options); backward_return_code: | /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h * @@ -1903,8 +1903,8 @@ int64_t batch_size = log_probs.size(1); int64_t num_labels = log_probs.size(2); int64_t max_target_length = target_lengths.max().cpu().item().to(); - auto neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - auto log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options()); + auto neg_log_likelihood = nodispatch::empty({batch_size}, log_probs.options()); + auto log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options()); interface: diopiCTCLoss(ctx, neg_log_likelihood, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths, target_lengths, blank, ReductionNone, zero_infinity); # TODO: param log_alpha ? - schema: "_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor neg_log_likelihood, Tensor log_alpha)" @@ -1915,10 +1915,10 @@ int64_t batch_size = log_probs.size(1); int64_t num_labels = log_probs.size(2); int64_t max_target_length = *std::max_element(target_lengths.begin(), target_lengths.end());; - auto neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - auto log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options()); - auto input_lengths_tensor = at::empty({static_cast(input_lengths.size())}, at::kLong); - auto target_lengths_tensor = at::empty({static_cast(target_lengths.size())}, at::kLong); + auto neg_log_likelihood = nodispatch::empty({batch_size}, log_probs.options()); + auto log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options()); + auto input_lengths_tensor = nodispatch::empty_cpu({static_cast(input_lengths.size())}, at::kLong); + auto target_lengths_tensor = nodispatch::empty_cpu({static_cast(target_lengths.size())}, at::kLong); std::copy(input_lengths.begin(), input_lengths.end(), static_cast(input_lengths_tensor.data_ptr())); std::copy(target_lengths.begin(), target_lengths.end(), static_cast(target_lengths_tensor.data_ptr())); interface: diopiCTCLoss(ctx, neg_log_likelihood, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths_tensor, target_lengths_tensor, blank, ReductionNone, zero_infinity); # TODO: param log_alpha ? @@ -1926,7 +1926,7 @@ - schema: "_ctc_loss_backward.Tensor(Tensor grad, Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor grad_input" device: [-camb, all] custom_code_at_the_beginning: | - auto grad_input = at::empty(log_probs.sizes(), grad.options()); + auto grad_input = nodispatch::empty(log_probs.sizes(), grad.options()); interface: diopiCTCLossBackward(ctx, grad_input, grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, ReductionNone, zero_infinity) - schema: "_ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor grad_input" @@ -1934,9 +1934,9 @@ no_device_check_args: [input_lengths_tensor, target_lengths_tensor] device: [-camb, all] custom_code_at_the_beginning: | - auto grad_input = at::empty(log_probs.sizes(), grad.options()); - auto input_lengths_tensor = at::empty({static_cast(input_lengths.size())}, at::kLong); - auto target_lengths_tensor = at::empty({static_cast(target_lengths.size())}, at::kLong); + auto grad_input = nodispatch::empty(log_probs.sizes(), grad.options()); + auto input_lengths_tensor = nodispatch::empty_cpu({static_cast(input_lengths.size())}, at::kLong); + auto target_lengths_tensor = nodispatch::empty_cpu({static_cast(target_lengths.size())}, at::kLong); std::copy(input_lengths.begin(), input_lengths.end(), static_cast(input_lengths_tensor.data_ptr())); std::copy(target_lengths.begin(), target_lengths.end(), static_cast(target_lengths_tensor.data_ptr())); interface: diopiCTCLossBackward(ctx, grad_input, grad, log_probs, targets, input_lengths_tensor, target_lengths_tensor, neg_log_likelihood, log_alpha, blank, ReductionNone, zero_infinity) @@ -2067,7 +2067,7 @@ auto output_shape = at::infer_size(x1_shape, x2_shape); *output_shape.rbegin() = x2.size(-2); *(output_shape.rbegin() + 1) = x1.size(-2); - auto out = at::empty(output_shape, x1.options()); + auto out = nodispatch::empty(output_shape, x1.options()); interface: diopiCdist(ctx, out, x1, x2, p, compute_mode_ptr) - schema: "_cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor grad_input" @@ -2079,7 +2079,7 @@ auto grad_shape = at::infer_size(x1_shape, x2_shape); *grad_shape.rbegin() = x1.size(-1); *(grad_shape.rbegin() + 1) = x1.size(-2); - auto grad_input = at::empty(grad_shape, grad.options()); + auto grad_input = nodispatch::empty(grad_shape, grad.options()); interface: diopiCdistBackward(ctx, grad_input, grad, x1, x2, p, cdist) - schema: "erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -2091,7 +2091,7 @@ - schema: "polar(Tensor abs, Tensor angle) -> Tensor" custom_code_at_the_beginning: | auto dtype = c10::toComplexType(abs.scalar_type()); - auto out = at::empty(abs.sizes(), abs.options().dtype(dtype)); + auto out = nodispatch::empty(abs.sizes(), abs.options().dtype(dtype)); interface: diopiPolar(ctx, out, abs, angle) - schema: "polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)" @@ -2129,7 +2129,7 @@ if(batched_input){ out_shape.insert(out_shape.begin(), input_shape[0]); } - auto out = at::empty({out_shape}, self.options()); + auto out = nodispatch::empty({out_shape}, self.options()); interface: diopiIm2Col(ctx, out, self, kernel_size, dilation, padding, stride) - schema: "col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor" @@ -2151,7 +2151,7 @@ if(batched_input){ out_shape.insert(out_shape.begin(), input_shape[0]); } - auto out = at::empty({out_shape}, self.options()); + auto out = nodispatch::empty({out_shape}, self.options()); interface: diopiCol2Im(ctx, out, self, output_size, kernel_size, dilation, padding, stride) - schema: "sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" @@ -2162,7 +2162,7 @@ - schema: "isnan(Tensor self) -> Tensor" custom_code_at_the_beginning: | - auto out = at::empty(self.sizes(), self.options().dtype(at::kBool)); + auto out = nodispatch::empty(self.sizes(), self.options().dtype(at::kBool)); interface: diopiIsNan(ctx, out, self) - schema: "embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor grad_weight" @@ -2184,15 +2184,15 @@ - schema: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) custom_code_at_the_beginning: | auto shape = input.size(1); - auto out0 = at::empty({shape}, input.options().dtype(at::kFloat)); - auto out1 = at::empty({shape}, input.options().dtype(at::kFloat)); + auto out0 = nodispatch::empty({shape}, input.options().dtype(at::kFloat)); + auto out1 = nodispatch::empty({shape}, input.options().dtype(at::kFloat)); interface: diopiBatchNormStats(ctx, out0, out1, input, eps) - schema: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor) custom_code_at_the_beginning: | auto shape = input.size(1); - auto out0 = at::empty({shape}, input.options().dtype(at::kFloat)); - auto out1 = at::empty({shape}, input.options().dtype(at::kFloat)); + auto out0 = nodispatch::empty({shape}, input.options().dtype(at::kFloat)); + auto out1 = nodispatch::empty({shape}, input.options().dtype(at::kFloat)); interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast(running_mean), const_cast(running_var), static_cast(momentum), static_cast(eps), counts) custom_code_before_call_diopi: | // NOTE: const_cast here is safe according to pytorch's source code @@ -2208,25 +2208,25 @@ at::Tensor out2; at::Tensor out3; if(input_g){ - out0 = at::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); - out1 = at::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + out0 = nodispatch::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + out1 = nodispatch::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); } if(weight_g){ - out2 = at::empty({shape}, input.options().dtype(at::kFloat)); + out2 = nodispatch::empty({shape}, input.options().dtype(at::kFloat)); } if(bias_g){ - out3 = at::empty({shape}, input.options().dtype(at::kFloat)); + out3 = nodispatch::empty({shape}, input.options().dtype(at::kFloat)); } interface: diopiBatchNormBackwardReduce(ctx, out0, out1, out2, out3, grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g) - schema: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor custom_code_at_the_beginning: | - auto out = at::empty_like(grad_out, grad_out.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + auto out = nodispatch::empty_like(grad_out, grad_out.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); interface: diopiBatchNormBackwardElemt(ctx, out, grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count); - schema: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor custom_code_at_the_beginning: | - auto out = at::empty_like(input, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); + auto out = nodispatch::empty_like(input, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt}); interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, static_cast(eps)); - schema: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) @@ -2251,7 +2251,7 @@ std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { auto& in = self[i]; - out[i] = at::empty(in.sizes(), in.options()); + out[i] = nodispatch::empty(in.sizes(), in.options()); dipu_add_out(self.at(i), other.at(i), alpha, out[i]); } return out; @@ -2272,7 +2272,7 @@ custom_code_at_the_beginning: | std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { - out[i] = at::empty(self[i].sizes(), self[i].options()); + out[i] = nodispatch::empty(self[i].sizes(), self[i].options()); dipu_add_scalar_out(self[i], scalar, 1.0 , out[i]); } return out; @@ -2293,7 +2293,7 @@ custom_code_at_the_beginning: | std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { - out[i] = at::empty_like(self[i]); + out[i] = nodispatch::empty_like(self[i]); dipu_mul_out(self[i], other[i], out[i]); } return out; @@ -2325,7 +2325,7 @@ std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { auto& in = self[i]; - out[i] = at::empty(in.sizes(), in.options()); + out[i] = nodispatch::empty(in.sizes(), in.options()); dipu_mul_scalar_out(self.at(i), scalar, out[i]); } return out; @@ -2337,7 +2337,7 @@ std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { auto& in = self[i]; - out[i] = at::empty(in.sizes(), in.options()); + out[i] = nodispatch::empty(in.sizes(), in.options()); dipu_mul_scalar_out(self.at(i), scalars[i], out[i]); } return out; @@ -2358,7 +2358,7 @@ custom_code_at_the_beginning: | std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { - out[i] = at::empty_like(self[i]); + out[i] = nodispatch::empty_like(self[i]); dipu_div_out(self[i], other[i], out[i]); } return out; @@ -2390,7 +2390,7 @@ std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { auto& in = self[i]; - out[i] = at::empty(in.sizes(), in.options()); + out[i] = nodispatch::empty(in.sizes(), in.options()); dipu_div_scalar_out(self.at(i), scalar, out[i]); } return out; @@ -2402,7 +2402,7 @@ std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { auto& in = self[i]; - out[i] = at::empty(in.sizes(), in.options()); + out[i] = nodispatch::empty(in.sizes(), in.options()); dipu_div_scalar_out(self.at(i), scalars[i], out[i]); } return out; @@ -2485,7 +2485,7 @@ custom_code_at_the_beginning: | std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { - out[i] = at::empty_like(self[i]); + out[i] = nodispatch::empty_like(self[i]); dipu_sqrt_out(self[i], out[i]); } return out; @@ -2497,7 +2497,7 @@ std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { auto& in = self[i]; - out[i] = at::empty(in.sizes(), in.options()); + out[i] = nodispatch::empty(in.sizes(), in.options()); dipu_neg_out(self.at(i), out[i]); } return out; @@ -2509,7 +2509,7 @@ std::vector out(self.size()); for (size_t i = 0;i < self.size();i++) { auto& in = self[i]; - out[i] = at::empty({}, in.options()); + out[i] = nodispatch::empty({}, in.options()); dipu_norm_out(in, ord, {}, false, out[i]); } return out; @@ -2520,7 +2520,7 @@ - schema: "wrap_diopi_cast_dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)" register_op: False custom_code_at_the_beginning: | - auto out = at::empty_like(self, self.options().dtype(dtype)); + auto out = nodispatch::empty_like(self, self.options().dtype(dtype)); interface: diopiCastDtype(ctx, out, self); # a diopi func wrapper. diff --git a/dipu/tests/python/unittests/test_profiler_vendor.py b/dipu/tests/python/unittests/test_profiler_vendor.py index ff9ff1f59..f94c81a66 100644 --- a/dipu/tests/python/unittests/test_profiler_vendor.py +++ b/dipu/tests/python/unittests/test_profiler_vendor.py @@ -32,7 +32,7 @@ def test_aot_profiler(self): with torch_dipu.profiler.NativeProfile(path, True): x.add_(y) - self.assertTrue(check_string_in_directory(path, "test_profiler.py")) + self.assertTrue(check_string_in_directory(path, "test_profiler_vendor.py")) self.assertTrue(check_string_in_directory(path, "aten::add_")) self.assertTrue(check_string_in_directory(path, "Add")) @@ -54,13 +54,13 @@ def fn(x): y = opt_model(input) z = y + y - self.assertTrue(check_string_in_directory(path, "test_profiler.py")) + self.assertTrue(check_string_in_directory(path, "test_profiler_vendor.py")) self.assertTrue(check_string_in_directory(path, "aten::add")) self.assertTrue(check_string_in_directory(path, "mulrelu")) self.assertTrue(check_string_in_directory(path, "softmax")) @onlyOn("CUDA") - def test_profiler(self): + def test_profiler_cuda(self): model = models.resnet18().cuda() inputs = torch.randn(5, 3, 224, 224).cuda() @@ -110,7 +110,7 @@ def test_profiler(self): prof.export_chrome_trace(f"{tmpdir}/resnet18_profiler_cuda.json") @onlyOn("MLU") - def test_profiler(self): + def test_profiler_mlu(self): model = models.resnet18().cuda() inputs = torch.randn(5, 3, 224, 224).cuda() diff --git a/dipu/third_party/kineto b/dipu/third_party/kineto index 926533879..08e23777d 160000 --- a/dipu/third_party/kineto +++ b/dipu/third_party/kineto @@ -1 +1 @@ -Subproject commit 92653387900fe7637a4065e710332dc81b73366d +Subproject commit 08e23777de5f6c95f1c58b2e98654a0ce70f8a1c diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp index 38fed1826..c19247ba3 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp @@ -6,10 +6,17 @@ #pragma once +#include #include #include +#include +#include +#include #include +#include #include +#include +#include #include #include "csrc_dipu/aten/DIPUATenFunctions.h" @@ -30,9 +37,69 @@ inline at::Tensor empty( memory_format)); } +// The code that calls this overloaded function is all for allocating CPU memory +inline at::Tensor empty_cpu( + at::IntArrayRef size, at::ScalarType dtype, + c10::optional device_opt = c10::nullopt, + c10::optional memory_format = c10::nullopt) { + return dipu_aten::empty_cpu(size, dtype, at::Layout::Strided, + device_or_default(device_opt), false, + c10::get_contiguous_memory_format()); +} + +inline at::Tensor empty_like( + const at::Tensor& self, c10::optional dtype, + c10::optional layout, c10::optional device, + c10::optional pin_memory, + c10::optional optional_memory_format) { + at::TensorOptions options_ = at::TensorOptions() + .dtype(dtype) + .layout(layout) + .device(device) + .pinned_memory(pin_memory); + at::TensorOptions options = + self.options().merge_in(options_).merge_memory_format( + optional_memory_format); + + TORCH_CHECK(!(options.layout() != c10::kStrided && + optional_memory_format.has_value()), + "memory format option is only supported by strided tensors"); + + auto memory_format = + options.memory_format_opt().value_or(at::MemoryFormat::Preserve); + + at::Tensor result; + + // TODO(liuweiyu): need to implement nodispatch::empty_strided + if (memory_format == at::MemoryFormat::Preserve) { + result = + empty(self.sizes(), options.memory_format(self.suggest_memory_format()), + c10::nullopt); + } else { + result = + empty(self.sizes(), options.memory_format(memory_format), c10::nullopt); + } + + if (self.opt_names()) { + at::namedinference::propagate_names(result, self.names()); + } + + // never propagate Conjugate, Negative, and ZeroTensor dispatch key + result._set_conj(false); + result._set_neg(false); + result._set_zero(false); + return result; +} + // an simplified version of `at::empty_like` but without dispatch -inline at::Tensor empty_like(const at::Tensor& self) { - return empty(self.sizes(), self.options()); +inline at::Tensor empty_like( + const at::Tensor& self, at::TensorOptions options = {}, + c10::optional memory_format = c10::nullopt) { + return nodispatch::empty_like( + self, c10::optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), + c10::impl::check_tensor_options_and_extract_memory_format(options, + memory_format)); } } // namespace nodispatch diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp index 0efbd3364..4ccaaa190 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp @@ -48,6 +48,55 @@ using result_ptr_t = std::shared_ptr; using trace_ptr_t = std::unique_ptr; +namespace { +struct TagToIOType { + DIPUInputOutputEncoder::Tag tag; + DIPUInputOutputEncoder::IOType io_type; +}; + +constexpr int tagCount = + (static_cast(DIPUInputOutputEncoder::Tag::TERMINATOR)) + 1; +constexpr std::array tag_map = {{ + {DIPUInputOutputEncoder::Tag::Tensor, + DIPUInputOutputEncoder::IOType::Shapes}, + {DIPUInputOutputEncoder::Tag::UndefinedTensor, + DIPUInputOutputEncoder::IOType::Shapes}, + {DIPUInputOutputEncoder::Tag::TensorListBegin, + DIPUInputOutputEncoder::IOType::Shapes}, + {DIPUInputOutputEncoder::Tag::ScalarList, + DIPUInputOutputEncoder::IOType::ConcreteInputs}, + {DIPUInputOutputEncoder::Tag::Scalar, + DIPUInputOutputEncoder::IOType::Shapes}, + {DIPUInputOutputEncoder::Tag::Other, + DIPUInputOutputEncoder::IOType::Shapes}, + {DIPUInputOutputEncoder::Tag::TERMINATOR, + DIPUInputOutputEncoder::IOType::None}, +}}; + +constexpr bool allTagsMapped(int idx = 0) { + return tag_map[idx].tag == DIPUInputOutputEncoder::Tag::TERMINATOR || + ((idx == static_cast(tag_map[idx].tag)) && + allTagsMapped(idx + 1)); +} +static_assert(allTagsMapped(), "tag_map is out of order"); + +constexpr DIPUInputOutputEncoder::IOType tagToIOType( + DIPUInputOutputEncoder::Tag tag) { + return tag_map[static_cast(tag)].io_type; +} + +constexpr int32_t kScalarListLengthLimit = 30; + +bool dipu_get_record_concrete_inputs_enabled() { +#if DIPU_TORCH_VERSION == 20000 + return false; +#else + return torch::profiler::impl::get_record_concrete_inputs_enabled(); +#endif +} + +} // namespace + void DIPUInputOutputEncoder::push(c10::ArrayRef values) { for (const auto& value : values) { if (value.isTensor()) { @@ -65,6 +114,9 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef values) { push(t); } tags_.emplace_back(Tag::TERMINATOR); + } else if (isSupportedScalarList(value)) { + tags_.emplace_back(Tag::ScalarList); + ivalues_.emplace_back(value); } else { tags_.emplace_back(Tag::Other); } @@ -73,7 +125,9 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef values) { } void DIPUInputOutputEncoder::push(const at::Tensor& t) { - if (t.defined() && !t.is_nested()) { // TODO(caikun-pjlab) fix nested sizes + // TODO(caikun-pjlab) fix nested sizes + if (t.defined() && !t.is_nested() && + !t.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) { tags_.emplace_back(Tag::Tensor); tensor_metadata_.emplace_back(t); tensor_sizes_strides_.copy(t.sizes()); @@ -86,12 +140,42 @@ void DIPUInputOutputEncoder::push(const at::Tensor& t) { } } -// This is a custom-iterator-like getter to obtain input shapes and dtypes. -auto DIPUInputOutputEncoder::getNextShapesAndDtypes() { +bool DIPUInputOutputEncoder::isSupportedScalarList( + const c10::IValue& list_candidate) { + // Scalar list can be very long. If a list is too long, we shouldn't + // collect it. This function checks whether the list is a scalar list + // and whether its length is sufficiently short. + + if (!dipu_get_record_concrete_inputs_enabled()) { + return false; + } + + if (!list_candidate.isList()) { + return false; + } + auto list_ref = list_candidate.toListRef(); + if (C10_UNLIKELY(list_ref.empty())) { + return true; + } + if (C10_UNLIKELY(!list_ref[0].isScalar())) { + return false; + } + if (C10_UNLIKELY(list_ref.size() > kScalarListLengthLimit)) { + return false; + } + return true; +} + +// io_type is used to filter the ivalues between 'Shapes' and 'Concrete Args'. +// Shapes are used to represent the shapes of tensors. We save only the shapes +// of the tensors because tensors can be large. +// Concrete args are separated to clarify that they are the actual values. +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +auto DIPUInputOutputEncoder::getIValueGenerator(const IOType& io_type) { return [this, tag_it = tags_.begin(), tensor_metadata_it = tensor_metadata_.begin(), tensor_size_strides_it = tensor_sizes_strides_.begin(), - ivals_it = ivalues_.begin()]() mutable { + ivals_it = ivalues_.begin(), io_type]() mutable { auto decode_tensor = [&]() -> TensorMetadata { const auto& raw_metadata = *tensor_metadata_it++; std::vector sizes; @@ -108,29 +192,48 @@ auto DIPUInputOutputEncoder::getNextShapesAndDtypes() { }; std::vector out; + auto push_value = [&out, io_type](const Tag& tag, op_input_t input) { + if (io_type == tagToIOType(tag)) { + out.push_back(std::move(input)); + } else { + out.emplace_back(c10::nullopt); + } + }; + bool terminate = false; while (!terminate && tag_it != tags_.end()) { switch (*tag_it) { case Tag::Tensor: - out.emplace_back(decode_tensor()); + push_value(*tag_it, decode_tensor()); break; case Tag::TensorListBegin: { std::vector arg; + bool found_undefined = false; while (*(++tag_it) != Tag::TERMINATOR) { - TORCH_INTERNAL_ASSERT(*tag_it == Tag::Tensor, (int)(*tag_it)); + if (*tag_it == Tag::UndefinedTensor) { + found_undefined = true; + continue; + } + TORCH_INTERNAL_ASSERT(*tag_it == Tag::Tensor, + static_cast(*tag_it)); arg.emplace_back(decode_tensor()); } - out.emplace_back(std::move(arg)); + if (found_undefined) { + push_value(*tag_it, c10::nullopt); + } else { + push_value(Tag::TensorListBegin, std::move(arg)); + } } break; + case Tag::ScalarList: case Tag::Scalar: - out.emplace_back(*ivals_it++); + push_value(*tag_it, *ivals_it++); break; case Tag::UndefinedTensor: case Tag::Other: - out.emplace_back(c10::nullopt); + push_value(*tag_it, c10::nullopt); break; case Tag::TERMINATOR: @@ -147,6 +250,14 @@ auto DIPUInputOutputEncoder::getNextShapesAndDtypes() { }; } +auto DIPUInputOutputEncoder::getInputShapeGenerator() { + return getIValueGenerator(IOType::Shapes); +} + +auto DIPUInputOutputEncoder::getConcreteInputGenerator() { + return getIValueGenerator(IOType::ConcreteInputs); +} + void DIPUInputOutputEncoder::clear() { tags_.clear(); tensor_metadata_.clear(); @@ -289,7 +400,8 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize( } } - auto input_getter = inputs_outputs_.getNextShapesAndDtypes(); + auto input_shape_getter = inputs_outputs_.getInputShapeGenerator(); + auto concrete_input_getter = inputs_outputs_.getConcreteInputGenerator(); // TODO(caikun-pjlab): CTAD will take care of template args when we move to // C++17 @@ -302,10 +414,10 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize( ExtraFields e { std::move(event->basic_fields_), DIPUThreadLocalSubqueue::TorchOpStorage::OpList::correlationID(event), - time_converter(event->end_time_), input_getter(), + time_converter(event->end_time_), input_shape_getter(), #if DIPU_TORCH_VERSION == 20000 #else - input_getter(), + concrete_input_getter(), #endif jit_stack(), jit_module(), extra_args(), gpu_fallback(), event->allow_tf32_cublas_, std::move(event->counters_) diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.h b/dipu/torch_dipu/csrc_dipu/profiler/collection.h index c937d796f..d78c15b61 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/collection.h +++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.h @@ -34,23 +34,31 @@ class DIPUInputOutputEncoder final { public: void push(c10::ArrayRef values); - // Used during post-processing to create vectors for shapes and dtype. - auto getNextShapesAndDtypes(); + auto getInputShapeGenerator(); + auto getConcreteInputGenerator(); + static bool isSupportedScalarList(const c10::IValue& list_candidate); void clear(); - private: enum class Tag { Tensor = 0, UndefinedTensor, TensorListBegin, // TODO(caikun-pjlab): generalize to other lists. + ScalarList, Scalar, Other, TERMINATOR }; + enum class IOType { Shapes, ConcreteInputs, None }; + + private: void push(const at::Tensor& t); + // Implementation detail for getInputShapeGenerator and + // getConcreteInputGenerator + auto getIValueGenerator(const IOType& io_type); + torch::profiler::impl::AppendOnlyList< Tag, torch::profiler::impl::IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_; diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp index fa9856bd7..51fff5078 100644 --- a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp +++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp @@ -28,16 +28,21 @@ namespace profile { namespace { inline int64_t getTimeUs() { auto constexpr scale = int64_t{1000}; - return torch::profiler::impl::getTime(true) / scale; + return torch::profiler::impl::getTime() / scale; } -const std::set kCpuTypes{ - libkineto::ActivityType::CPU_OP, - libkineto::ActivityType::CPU_INSTANT_EVENT, - libkineto::ActivityType::USER_ANNOTATION, - libkineto::ActivityType::EXTERNAL_CORRELATION, - libkineto::ActivityType::CUDA_RUNTIME, - libkineto::ActivityType::PYTHON_FUNCTION, +const std::set kCpuTypes { + libkineto::ActivityType::CPU_OP, libkineto::ActivityType::CPU_INSTANT_EVENT, + libkineto::ActivityType::USER_ANNOTATION, + libkineto::ActivityType::EXTERNAL_CORRELATION, +#if DIPU_TORCH_VERSION == 20000 + libkineto::ActivityType::CUDA_RUNTIME, +#else + libkineto::ActivityType::XPU_RUNTIME, + libkineto::ActivityType::CUDA_RUNTIME, + libkineto::ActivityType::CUDA_DRIVER, +#endif + libkineto::ActivityType::PYTHON_FUNCTION, }; using torch::autograd::profiler::experimental_event_t; @@ -46,8 +51,10 @@ using torch::autograd::profiler::post_process_t; using torch::autograd::profiler::ProfilerResult; using torch::profiler::impl::ActiveProfilerType; #if DIPU_TORCH_VERSION == 20000 -using torch::profiler::impl::dtypesToStr; +constexpr auto strListToStr = torch::profiler::impl::dtypesToStr; #else +using torch::profiler::impl::ivalueListToStr; +constexpr auto strListToStr = torch::profiler::impl::strListToStr; #endif using torch::profiler::impl::EventType; using torch::profiler::impl::ExtraFields; @@ -60,30 +67,61 @@ using torch::profiler::impl::shapesToStr; using torch::profiler::impl::stacksToStr; using torch::profiler::impl::TensorMetadata; -auto shapesAndDtypes(const std::vector& inputs) { +struct OpArgData { + bool has_data; std::vector> shapes; std::vector dtypes; - for (const auto& i : inputs) { + std::vector concrete_inputs; +}; + +auto parseArgData(const std::vector& input_shapes, + const std::vector& concrete_inputs) { + if (input_shapes.empty()) { + return OpArgData{false, {}, {}, {}}; + } + + std::vector> shapes(input_shapes.size()); + std::vector dtypes(input_shapes.size()); + std::vector concrete_inputs_list; + + for (const auto& i : c10::irange(input_shapes.size())) { c10::visit(c10::overloaded( [&](const TensorMetadata& t) { - shapes.emplace_back(t.sizes_); - dtypes.emplace_back(scalarTypeToTypeMeta(t.dtype_).name()); + shapes[i] = t.sizes_; + dtypes[i] = std::string( + scalarTypeToTypeMeta(t.dtype_).name().data(), + scalarTypeToTypeMeta(t.dtype_).name().size()); }, [&](const std::vector&) { - shapes.emplace_back(); - dtypes.emplace_back("TensorList"); + dtypes[i] = "TensorList"; }, - [&](const c10::IValue&) { - shapes.emplace_back(); - dtypes.emplace_back("Scalar"); - }, - [&](const auto&) { - shapes.emplace_back(); - dtypes.emplace_back(); - }), - i); + [&](const c10::IValue& val) { dtypes[i] = "Scalar"; }, + [&](const auto&) {}), + input_shapes[i]); + } + + // If we recorded concrete inputs, then parse them + if (input_shapes.size() == concrete_inputs.size() && + !concrete_inputs.empty()) { + concrete_inputs_list.resize(input_shapes.size()); + + for (const auto& i : c10::irange(input_shapes.size())) { + c10::visit( + c10::overloaded( + [&](const c10::IValue& val) { concrete_inputs_list[i] = val; }, + [&](const auto&) {}), + input_shapes[i]); + c10::visit(c10::overloaded( + [&](const c10::IValue& val) { + concrete_inputs_list[i] = val; + dtypes[i] = "ScalarList"; + }, + [&](const auto&) {}), + concrete_inputs[i]); + } } - return std::make_pair(shapes, dtypes); + + return OpArgData{true, shapes, dtypes, concrete_inputs_list}; } struct MetadataBase { @@ -163,20 +201,28 @@ struct AddGenericMetadata : public MetadataBase { } void operator()(ExtraFields& op_event) { - const auto shapes_and_dtypes = shapesAndDtypes(op_event.inputs_); - if (!shapes_and_dtypes.first.empty()) { - addMetadata("Input Dims", shapesToStr(shapes_and_dtypes.first)); - } #if DIPU_TORCH_VERSION == 20000 - if (!shapes_and_dtypes.second.empty()) { - addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second)); - } + const std::vector concrete_inputs; #else + const auto& concrete_inputs = op_event.concrete_inputs_; #endif + const auto arg_data = parseArgData(op_event.inputs_, concrete_inputs); + + if (arg_data.has_data) { + addMetadata("Input Dims", shapesToStr(arg_data.shapes)); + addMetadata("Input type", strListToStr(arg_data.dtypes)); +#if DIPU_TORCH_VERSION == 20000 +#else + if (!arg_data.concrete_inputs.empty()) { + addMetadata("Concrete Inputs", + ivalueListToStr(arg_data.concrete_inputs)); + } +#endif + } if (config_ && !config_->experimental_config.performance_events.empty()) { auto& event_names = config_->experimental_config.performance_events; - for (auto i = 0; i < op_event.perf_event_counters_->size(); ++i) { + for (const auto i : c10::irange(op_event.perf_event_counters_->size())) { addMetadata(event_names[i], std::to_string((*op_event.perf_event_counters_)[i])); } @@ -308,7 +354,7 @@ struct DIPUKinetoThreadLocalState : public ProfilerStateBase { void materializeOpEvents(std::vector>& events) { for (auto& e : events) { - if (e->parent_.expired()) { + if (e->parent_.expired() && e->deviceType() == c10::DeviceType::CPU) { event_tree_.push_back(e); } diff --git a/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp index 9e05f9af1..c9e3e78a8 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp @@ -18,7 +18,10 @@ namespace devapis { using ascend_deviceId = int32_t; thread_local bool setDevFlag = false; -void initializeVendor() { DIPU_CALLACLRT(aclInit(nullptr)); } +void initializeVendor() { + DIPU_CALLACLRT(aclInit(nullptr)); + DIPU_CALLACLRT(aclrtSetDeviceSatMode(ACL_RT_OVERFLOW_MODE_INFNAN)); +} void finalizeVendor() { DIPU_CALLACLRT(aclFinalize()); } diff --git a/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp index 944bfdb32..b68d5e818 100644 --- a/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp +++ b/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -7,21 +9,44 @@ namespace dipu { -// Discriminate floating device type. -// static bool is_floating_device = true; - -// just an example -// not implemented now class DROPLETGeneratorImpl : public dipu::DIPUGeneratorImpl { + private: + static constexpr std::size_t seed_size = sizeof(uint64_t); + static constexpr std::size_t offset_size = sizeof(int64_t); + static constexpr std::size_t total_size = seed_size + offset_size; + public: - DROPLETGeneratorImpl(at::DeviceIndex device_index) + explicit DROPLETGeneratorImpl(at::DeviceIndex device_index) : dipu::DIPUGeneratorImpl(device_index) {} - void set_state(const c10::TensorImpl& state) override {} - - void update_state() const override {} + void set_state(const c10::TensorImpl& state) override { + at::detail::check_rng_state(state); + auto state_size = state.numel(); + TORCH_CHECK( + state_size == total_size || state_size == total_size - offset_size, + "RNG state size is invalid"); + + state_ = at::Tensor( + state.shallow_copy_and_detach(state.version_counter(), true)); + state_need_reset_ = false; + } + + void update_state() const override { + if (state_need_reset_) { + state_ = at::detail::empty_cpu({static_cast(total_size)}, + c10::ScalarType::Byte, c10::nullopt, + c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_.data_ptr(); + uint64_t seed = this->current_seed(); + + std::memcpy(rng_state, &seed, seed_size); + std::memset(rng_state + seed_size, 0, offset_size); + state_need_reset_ = false; + } + } }; +// NOLINTNEXTLINE(readability-const-return-type) const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) { return at::make_generator(device_index); }