diff --git a/dicp/dicp/vendor/AscendGraph/conversion.py b/dicp/dicp/vendor/AscendGraph/conversion.py
index c662c33ea..c1b11e21a 100644
--- a/dicp/dicp/vendor/AscendGraph/conversion.py
+++ b/dicp/dicp/vendor/AscendGraph/conversion.py
@@ -500,17 +500,17 @@ def ne(self, a, b):
 
     @register_conversion([aten.lt.Scalar, aten.lt.Tensor])
     def lt(self, x, y):
-        y_shape = [1]
-        if isinstance(y, torch.fx.proxy.Proxy):
-            y_shape = list(y.node.meta['val'].shape)
         x_shape = list(x.node.meta['val'].shape)
+        y_is_proxy = isinstance(y, torch.fx.proxy.Proxy)
+        y_shape = list(y.node.meta['val'].shape) if y_is_proxy else []
         out = list(fx_traceback.get_current_meta()['val'].shape)
         out_shape = self.get_shape_proxy(out)
         x, y = self.binary_cmp_cast_input(x, y)
-
-        if self.shape_prod(x_shape) < self.shape_prod(out):
+        dynamic_shape = any(
+            symint_in_shape(shape) for shape in (x_shape, y_shape, out))
+        if dynamic_shape and self.shape_prod(x_shape) < self.shape_prod(out):
             x = self.get_proxy(ascend_op.BroadcastTo, (x, out_shape))
-        if self.shape_prod(y_shape) < self.shape_prod(out):
+        if dynamic_shape and self.shape_prod(y_shape) < self.shape_prod(out):
             y = self.get_proxy(ascend_op.BroadcastTo, (y, out_shape))
         return self.get_proxy(ascend_op.Less, (x, y))
 
@@ -834,6 +834,26 @@ def compute_stacked_indices(self, indices, src_shape):
 
     @register_conversion(torch.ops.aten.index_put.default)
     def index_put_default(self, x, indices, values):
+        x_shape = list(x.node.meta['val'].shape)
+
+        # A bool index is a mask rather than a list of positions, so this case
+        # is lowered to masked_fill instead of the scatter update below.
+        # Currently only a single index (len(indices) == 1) is supported.
+        if any(index is not None and
+               index.node.meta['val'].dtype == torch.bool for index in indices):
+            assert len(indices) == 1, \
+                'index_put with a bool mask supports exactly one index'
+            index = indices[0]
+            index_shape = list(index.node.meta['val'].shape)
+            # Trailing size-1 dims let a lower-rank mask broadcast against x.
+            pad_dims = len(x_shape) - len(index_shape)
+            if pad_dims > 0:
+                reshape_shape = index_shape + [1] * pad_dims
+                reshape_op = self.get_const_proxy(
+                    reshape_shape, torch.int32)
+                index = self.get_proxy(ascend_op.Reshape, (index, reshape_op))
+            return self.masked_fill(x, index, values)
+
         # following comment is from tensorflow tensor_scatter_nd_update:
         # index_depth = indices.shape[-1]
         # batch_shape = indices.shape[:-1]
@@ -845,7 +865,6 @@ def index_put_default(self, x, indices, values):
         # tf.tensor_scatter_nd_update param 'indices' is different from
         # indices in torch.ops.aten.index_put.default, we use broadcast and
         # stack to construct param 'indices' in tf.tensor_scatter_nd_update
-        x_shape = list(x.node.meta['val'].shape)
         stacked_indices, indices_broadcast_shape, stacked_indices_last_dim = \
             self.compute_stacked_indices(indices, x.node.meta['val'].shape)
         values_broadcast_shape = indices_broadcast_shape + x_shape[stacked_indices_last_dim:] # batch_shape + inner_shape
diff --git a/dicp/test/model/test_hf.py b/dicp/test/model/test_hf.py
index 7b978b54d..fe439fe06 100644
--- a/dicp/test/model/test_hf.py
+++ b/dicp/test/model/test_hf.py
@@ -46,4 +46,4 @@
     response_list.append(response.split('\n'))
 
 for idx, dicp_result in enumerate(response_list):
-    assert dicp_result == cuda_results[idx]
+    assert dicp_result == cuda_results[idx], f"dicp result:{dicp_result}, cuda_result:{cuda_results[idx]}"
diff --git a/dicp/test/model/test_llama.py b/dicp/test/model/test_llama.py
index 0353b3aee..433538bfb 100644
--- a/dicp/test/model/test_llama.py
+++ b/dicp/test/model/test_llama.py
@@ -101,4 +101,4 @@ def test_inference(
                 prompt, max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, device=device
             )
             dicp_result = dicp_result[0].split("\n")
-            assert dicp_result == cuda_results[i]
+            assert dicp_result == cuda_results[i], f"dicp result:{dicp_result}, cuda_result:{cuda_results[i]}"
diff --git a/dicp/test/model/test_stable_diffusion.py b/dicp/test/model/test_stable_diffusion.py
index 4f8ae9ac8..a94a1258a 100644
--- a/dicp/test/model/test_stable_diffusion.py
+++ b/dicp/test/model/test_stable_diffusion.py
@@ -50,8 +50,10 @@ def test_inference(
         dicp_pipe = StableDiffusionPipeline.from_pretrained(model_path).to(device)
         dicp_pipe.text_encoder = torch.compile(dicp_pipe.text_encoder, backend=backend, dynamic=dynamic)
         dicp_pipe.unet = torch.compile(dicp_pipe.unet, backend=backend, dynamic=dynamic)
-        if backend == "ascendgraph":
-            dicp_pipe.vae.decoder = torch.compile(dicp_pipe.vae.decoder, backend=backend, dynamic=dynamic)
+
+        # TODO: temporarily run the decoder on CPU; re-enable the compile below.
+        # if backend == "ascendgraph":
+        #     dicp_pipe.vae.decoder = torch.compile(dicp_pipe.vae.decoder, backend=backend, dynamic=dynamic)
         dicp_image = dicp_pipe(prompt, num_inference_steps=num_inference_steps).images[0]
 
         similarity = get_similarity(cpu_image, dicp_image)
diff --git a/dicp/test/op/test_index_put.py b/dicp/test/op/test_index_put.py
index 98beeb8c3..26ee1a17d 100644
--- a/dicp/test/op/test_index_put.py
+++ b/dicp/test/op/test_index_put.py
@@ -34,7 +34,7 @@ class TestIndexPut():
                                             ((1, 2, 10, 8 ,7, 11), (None, None, (2, 3), (4, 1, 1), None, (1, 2, 1)),
                                              (4, 2, 3, 1, 2, 7)))])
     @pytest.mark.parametrize("compiled_model", compiled_model)
-    def test_torch_split(self, sizes, dtype, compiled_model):
+    def test_torch_index_put(self, sizes, dtype, compiled_model):
         device = get_device()
         size = sizes.dynamic if compiled_model.dynamic else sizes.static
         x_size = size[0]
@@ -59,3 +59,27 @@ def test_torch_split(self, sizes, dtype, compiled_model):
         dicp_output = compiled_model.model(dicp_input1, dicp_indices, dicp_value)
 
         assert torch.allclose(output.cpu(), dicp_output.cpu(), equal_nan=True)
+
+    @pytest.mark.parametrize("dtype", [torch.float16])
+    @pytest.mark.parametrize("sizes", [Size((5,), (5, 3)), Size((3, 5), (5, 3)), Size((2, 3, 4), (2, 4))])
+    @pytest.mark.parametrize("compiled_model", compiled_model)
+    def test_torch_index_put_to_masked_fill(self, sizes, dtype, compiled_model):
+        """Compare index_put with one bool mask over dim 0 to the `model` reference."""
+        device = get_device()
+        size = sizes.dynamic if compiled_model.dynamic else sizes.static
+
+        x = torch.randn(size, dtype=dtype)
+        mask = torch.randn(size[0], dtype=dtype) > 0
+        value = torch.tensor(1, dtype=dtype)
+        indices = [mask]
+
+        dicp_input = x.to(device)
+        dicp_indices = [mask.to(device)]
+        dicp_value = value.to(device)
+
+        output = model(x, indices, value)
+        dynamo.reset()
+        update_dynamo_config(compiled_model.dynamic)
+        dicp_output = compiled_model.model(dicp_input, dicp_indices, dicp_value)
+
+        assert torch.allclose(output.cpu(), dicp_output.cpu(), equal_nan=True)
diff --git a/dipu/.clang-tidy b/dipu/.clang-tidy
index 8c9ddabec..14fefff48 100644
--- a/dipu/.clang-tidy
+++ b/dipu/.clang-tidy
@@ -66,6 +66,8 @@ CheckOptions:
     value: true
   - key:   readability-implicit-bool-conversion.AllowPointerConditions
     value: true
+  - key:   readability-simplify-boolean-expr.SimplifyDeMorgan
+    value: false
 # --- Google's naming convention BEGIN ---
 # modified part is marked as comment
   - key:   readability-identifier-naming.ClassCase
diff --git a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
index 47efb6a19..5ff126b9a 100755
--- a/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
+++ b/dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -60,7 +60,7 @@
         return dipu_sub_scalar_out(self, other.item(), alpha, out);
     }
     if (self.numel() == 1 && self.is_cpu()) {
-        at::Tensor selfTensor = at::empty_like(other);
+        at::Tensor selfTensor = nodispatch::empty_like(other);
         dipu_fill__scalar(selfTensor, self.item());
         return dipu_sub_out(selfTensor, other, alpha, out);
     }
@@ -78,7 +78,7 @@
 
 - schema: "div.Scalar(Tensor self, Scalar other) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiDivScalar(ctx, out, self, other, RoundModeNone)
 
 - schema: "div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)"
@@ -101,7 +101,7 @@
     }
     if (self.numel() == 1 && self.is_cpu()) {
         // todo:(wentao) temp solution, need using a type promotion strategy
-        at::Tensor selfD = at::empty_like(other);
+        at::Tensor selfD = nodispatch::empty_like(other);
         dipu_fill__scalar(selfD, self.item());
         return dipu_div_out(selfD, other, out);
     }
@@ -124,7 +124,7 @@
   interface: diopiDiv(ctx, out, self, other, mode)
 
 - schema: "mul.Scalar(Tensor self, Scalar other) -> Tensor"
-  custom_code_at_the_beginning: auto out = at::empty_like(self);
+  custom_code_at_the_beginning: auto out = nodispatch::empty_like(self);
   interface: diopiMulScalar(ctx, out, self, other)
 
 - schema: "mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)"
@@ -164,7 +164,7 @@
 - schema: "logical_and(Tensor self, Tensor other) -> Tensor"
   custom_code_at_the_beginning: |
     auto shape = at::infer_size(self.sizes(), other.sizes());
-    auto out = at::empty(shape, self.options().dtype(at::kBool));
+    auto out = nodispatch::empty(shape, self.options().dtype(at::kBool));
   interface: diopiLogicalAnd(ctx, out, self, other);
 
 - schema: "logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
@@ -181,7 +181,7 @@
 - schema: "logical_or(Tensor self, Tensor other) -> Tensor"
   custom_code_at_the_beginning: |
     auto shape = at::infer_size(self.sizes(), other.sizes());
-    auto out = at::empty(shape, self.options().dtype(at::kBool));
+    auto out = nodispatch::empty(shape, self.options().dtype(at::kBool));
   interface: diopiLogicalOr(ctx, out, self, other);
 
 - schema: "logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -196,7 +196,7 @@
 
 - schema: "logical_not(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty(self.sizes(), self.options().dtype(at::kBool));
+    auto out = nodispatch::empty(self.sizes(), self.options().dtype(at::kBool));
   interface: diopiLogicalNot(ctx, out, self);
 
 - schema: "aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))"
@@ -212,7 +212,7 @@
     const int64_t dim_c = input.size(1);
     const auto input_shape = input.sizes();
     const int axis = input_shape.size();
-    auto out0 = at::empty_like(input, input.options(), \
+    auto out0 = nodispatch::empty_like(input, input.options(), \
         (axis==4?\
             (c10::optional<at::MemoryFormat>(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt})):\
             (axis==5?\
@@ -224,11 +224,11 @@
     at::Tensor out2;
     if (!training) {
         // do not require save_mean/save_invstd when in test mode
-        out1 = at::empty({0}, options);
-        out2 = at::empty({0}, options);
+        out1 = nodispatch::empty({0}, options);
+        out2 = nodispatch::empty({0}, options);
     } else {
-        out1 = at::empty({dim_c}, options);
-        out2 = at::empty({dim_c}, options);
+        out1 = nodispatch::empty({dim_c}, options);
+        out2 = nodispatch::empty({dim_c}, options);
     }
   interface: diopiBatchNorm(ctx, out0, out1, out2, input, weight, bias, const_cast<diopiTensorHandle_t>(running_mean), const_cast<diopiTensorHandle_t>(running_var), training, momentum, eps);
   custom_code_before_call_diopi: |
@@ -243,30 +243,30 @@
     auto options = input.options().dtype(at::kFloat);
     const auto input_shape = input.sizes();
     const int axis = input_shape.size();
-    at::Tensor out0 = at::empty_like(input, input.options(), \
+    at::Tensor out0 = nodispatch::empty_like(input, input.options(), \
         (axis==4?\
              (c10::optional<at::MemoryFormat>(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt})):\
              (axis==5?\
                  (c10::optional<at::MemoryFormat>(${PREFERRED_MEMORY_FORMAT_PLACEHOLDER_3D:-c10::nullopt})):\
                   c10::optional<at::MemoryFormat>(c10::nullopt))\
         ));
-    at::Tensor out1 = at::empty({dim_c}, options);
-    at::Tensor out2 = at::empty({dim_c}, options);
+    at::Tensor out1 = nodispatch::empty({dim_c}, options);
+    at::Tensor out2 = nodispatch::empty({dim_c}, options);
   interface: diopiBatchNormBackward(ctx, out0, out1, out2, grad_out, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps)
 
 - schema: "native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)"
   custom_code_at_the_beginning: |
-    auto out0 = at::empty_like(input);
+    auto out0 = nodispatch::empty_like(input);
     auto options = input.options().dtype(dipu::native::mixed_output_scalar_type(input, weight, bias));
-    auto out1 = at::empty({N.expect_int(), group}, options);
-    auto out2 = at::empty({N.expect_int(), group}, options);
+    auto out1 = nodispatch::empty({N.expect_int(), group}, options);
+    auto out2 = nodispatch::empty({N.expect_int(), group}, options);
   interface: diopiGroupNorm(ctx, out0, out1, out2, input, weight, bias, group, eps);
 
 - schema: "native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)"
   custom_code_at_the_beginning: |
-    auto out0 = output_mask[0] ? at::empty_like(input) : at::Tensor();
-    auto out1 = output_mask[1] ? at::empty_like(weight.value()) : at::Tensor();
-    auto out2 = output_mask[2] ? at::empty_like(weight.value()) : at::Tensor();
+    auto out0 = output_mask[0] ? nodispatch::empty_like(input) : at::Tensor();
+    auto out1 = output_mask[1] ? nodispatch::empty_like(weight.value()) : at::Tensor();
+    auto out2 = output_mask[2] ? nodispatch::empty_like(weight.value()) : at::Tensor();
   interface: diopiGroupNormBackward(ctx, out0, out1, out2, grad_out, input, weight, mean, rstd, group);
 
 - schema: "native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor out, Tensor save_mean, Tensor save_invstd)"
@@ -277,9 +277,9 @@
     std::vector<int64_t> stats_shape(input_shape.size(), 1);
     std::copy(input_shape.begin(), input_shape.begin() + axis, stats_shape.begin());
     auto options = input.options();
-    auto save_mean = at::empty(stats_shape, options);
-    auto save_invstd = at::empty(stats_shape, options);
-    auto out = at::empty_like(
+    auto save_mean = nodispatch::empty(stats_shape, options);
+    auto save_invstd = nodispatch::empty(stats_shape, options);
+    auto out = nodispatch::empty_like(
       input,
       c10::nullopt /* dtype */,
       c10::nullopt /* layout */,
@@ -293,9 +293,9 @@
 - schema: "native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
   custom_code_at_the_beginning: |
     auto options = grad_out.options();
-    auto grad_input = output_mask[0] ? at::empty(input.sizes(), options) : at::Tensor();
-    auto grad_weight = output_mask[1] ? at::empty(weight.value().sizes(), options) : at::Tensor();
-    auto grad_bias = output_mask[2] ? at::empty(bias.value().sizes(), options) : at::Tensor();
+    auto grad_input = output_mask[0] ? nodispatch::empty(input.sizes(), options) : at::Tensor();
+    auto grad_weight = output_mask[1] ? nodispatch::empty(weight.value().sizes(), options) : at::Tensor();
+    auto grad_bias = output_mask[2] ? nodispatch::empty(bias.value().sizes(), options) : at::Tensor();
   interface: diopiLayerNormBackward(ctx, grad_input, grad_weight, grad_bias, grad_out, input, weight,  bias, mean, rstd, normalized_shape);
 
 - schema: "aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))"
@@ -312,12 +312,12 @@
     auto out_tensor_size = self.sizes().vec();
     out_tensor_size[self.dim() - 2] = output_size[0].expect_int();
     out_tensor_size[self.dim() - 1] = output_size[1].expect_int();
-    at::Tensor out = at::empty(out_tensor_size, self.options());
+    at::Tensor out = nodispatch::empty(out_tensor_size, self.options());
   interface: diopiAdaptiveAvgPool2d(ctx, out, self, output_size)
 
 - schema: "_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiAdaptiveAvgPool2dBackward(ctx, out, grad_output, self);
 
 - schema: "avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)"
@@ -470,7 +470,7 @@
   interface: diopiReluInp(ctx, self)
 
 - schema: "relu(Tensor self) -> Tensor"
-  custom_code_at_the_beginning: auto out = at::empty_like(self);
+  custom_code_at_the_beginning: auto out = nodispatch::empty_like(self);
   interface: diopiRelu(ctx, out, self)
 
 - schema: "randperm.out(int n, *, Tensor(a!) out) -> Tensor(a!)"
@@ -489,7 +489,7 @@
     ::diopiConstTensorHandle_t self_dtype_diopi = dipu::diopi_helper::toDiopiTensorHandle(self_dtype);
     if (out.numel() == 0) {
       std::vector<int64_t> output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector<int64_t>()), keepdim);
-      out = at::empty(output_shape, self.options());
+      out = nodispatch::empty(output_shape, self.options());
       }
     ::diopiSize_t diopi_size = toDiopiSize(dim);
   interface: diopiSum(ctx, out, self_dtype_diopi, diopi_size)
@@ -503,7 +503,7 @@
   register_op: False
   custom_code_at_the_beginning: |
     const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction);
-    at::Tensor out = at::empty_like(self);
+    at::Tensor out = nodispatch::empty_like(self);
   interface: diopiCrossEntropyLossBackward(ctx, out, grad_output, self, target, weight, reductionDiopi, ignore_index.expect_int(), label_smoothing)
 
 - schema: "cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor"
@@ -514,9 +514,9 @@
     at::Tensor out;
     auto options = self.options();
     if (reductionDiopi == ReductionNone) {
-      out = at::empty(target.sizes(), options);
+      out = nodispatch::empty(target.sizes(), options);
     } else {
-      out = at::empty({}, options);
+      out = nodispatch::empty({}, options);
     }
   interface: diopiCrossEntropyLoss(ctx, out, self, target, weight, reductionDiopi, ignore_index_int, label_smoothing)
   backward_schema: "cross_entropy_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor"
@@ -546,7 +546,7 @@
     int64_t out_height = (height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1;
     int64_t out_width = (width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1;
     c10::SmallVector<int64_t, 4> output_size = {batch_size, out_channel, out_height, out_width};
-    at::Tensor out = at::empty(output_size, input.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-input.suggest_memory_format()});
+    at::Tensor out = nodispatch::empty(output_size, input.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-input.suggest_memory_format()});
   interface: diopiConvolution2d(&context, out, input, weight, bias, stride, padding, dilation, groups)
 
 - schema: "convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
@@ -558,14 +558,14 @@
     at::Tensor grad_bias;
     std::vector<int64_t> bias_sizes;
     if (output_mask[0]) {
-      grad_input = at::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+      grad_input = nodispatch::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
     }
     if (output_mask[1]) {
-      grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat).memory_format(weight.suggest_memory_format()));
+      grad_weight = nodispatch::empty(weight.sizes(), weight.options().dtype(at::kFloat).memory_format(weight.suggest_memory_format()));
     }
     if (output_mask[2]) {
       bias_sizes.push_back(grad_output.size(1));
-      grad_bias = at::empty(bias_sizes, grad_output.options());
+      grad_bias = nodispatch::empty(bias_sizes, grad_output.options());
     }
   custom_code_before_call_diopi: |
     ::diopiSize_t* bias_sizes_ptr = output_mask[2] ? &bias_sizesDiopiSize : nullptr;
@@ -578,10 +578,10 @@
     at::Tensor grad_input;
     at::Tensor grad_weight;
     at::Tensor grad_bias;
-    grad_input = at::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
-    grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat));
+    grad_input = nodispatch::empty(input.sizes(), input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    grad_weight = nodispatch::empty(weight.sizes(), weight.options().dtype(at::kFloat));
     if (output_mask[2]) {
-        grad_bias = at::empty({grad_output.size(1)}, grad_output.options());
+        grad_bias = nodispatch::empty({grad_output.size(1)}, grad_output.options());
     }
   custom_code_before_call_diopi: |
     ::diopiSize_t* bias_sizes_ptr = output_mask[2] ? &bias_sizesDiopiSize : nullptr;
@@ -600,7 +600,7 @@
     const int64_t w_out = (w_in - 1) * stride[1] - 2 * padding[1] + (dilation[1] * (kernel_width - 1) + 1) + output_padding[1];
     const int64_t c_out = weight.size(1) * groups;
     auto output_shape =  input.sizes().size() == 3 ? std::vector<int64_t>{c_out, h_out, w_out} : std::vector<int64_t>{n, c_out, h_out, w_out};
-    auto out = at::empty(output_shape, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    auto out = nodispatch::empty(output_shape, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
   interface: diopiConvTranspose2d(ctx, out, input, weight, bias, stride, padding, output_padding, groups, dilation)
   forward_process_code: |
     bool bias_has_value = (bias.has_value()) ? bias.value().requires_grad() : false;
@@ -639,11 +639,11 @@
 
 - schema: "native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)"
   custom_code_at_the_beginning: |
-    at::Tensor out0 = at::empty_like(input);
+    at::Tensor out0 = nodispatch::empty_like(input);
     at::Tensor out1;
     bool train_ = train.value_or(false);
     if (train_) {
-      out1 = at::empty(input.sizes(), input.options().dtype(at::kByte));;
+      out1 = nodispatch::empty(input.sizes(), input.options().dtype(at::kByte));
     }
     diopiGeneratorHandle_t generatorDiopiGenerator = toDiopiGeneratorHandle(getDefaultDIPUGenerator());
   interface: diopiDropout(ctx, out0, out1, input, p, train_, generatorDiopiGenerator)
@@ -672,7 +672,7 @@
 
 - schema: "abs(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiAbs(ctx, out, self)
 
 - schema: "abs_(Tensor(a!) self) -> Tensor(a!)"
@@ -715,8 +715,8 @@
       dim = dim + static_cast<int64_t>(output_size.size());
     }
     output_size[dim] = k;
-    auto values = at::empty(output_size, self.options());
-    auto indices = at::empty(output_size, self.options().dtype(at::kLong));
+    auto values = nodispatch::empty(output_size, self.options());
+    auto indices = nodispatch::empty(output_size, self.options().dtype(at::kLong));
   interface: diopiTopk(ctx, values, indices, self, k, dim, largest, sorted)
 
 - schema: "mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)"
@@ -730,7 +730,7 @@
   torch_ver: ["200",]
   custom_code_at_the_beginning: |
     std::vector<int64_t> output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector<int64_t>()), keepdim);
-    auto out = at::empty(output_shape, self.options());
+    auto out = nodispatch::empty(output_shape, self.options());
     bool unbiased = correction.value_or(1) == 1;
     ::diopiSize_t diopi_size = toDiopiSize(dim);
   interface: diopiStd(ctx, out, self, diopi_size, unbiased);
@@ -746,7 +746,7 @@
   torch_ver: ["211",]
   custom_code_at_the_beginning: |
     std::vector<int64_t> output_shape = infer_reduce_op_shape(self.sizes(), dim.value_or(std::vector<int64_t>()), keepdim);
-    auto out = at::empty(output_shape, self.options());
+    auto out = nodispatch::empty(output_shape, self.options());
     bool unbiased = correction.value_or(1).toLong() == 1;
     ::diopiSize_t diopi_size = toDiopiSize(dim);
   interface: diopiStd(ctx, out, self, diopi_size, unbiased);
@@ -766,13 +766,13 @@
     at::Tensor grad_weight;
     at::Tensor grad_bias;
     if (output_mask[0]) {
-      grad_input = at::empty(input.sizes(), grad_output.options());
+      grad_input = nodispatch::empty(input.sizes(), grad_output.options());
     }
     if (output_mask[1]) {
-      grad_weight = at::empty(weight.sizes(), grad_output.options());
+      grad_weight = nodispatch::empty(weight.sizes(), grad_output.options());
     }
     if (output_mask[2]) {
-      grad_bias = at::empty({grad_output.size(-1)}, grad_output.options());
+      grad_bias = nodispatch::empty({grad_output.size(-1)}, grad_output.options());
     }
   interface: diopiLinearBackward(ctx, grad_input, grad_weight, grad_bias, grad_output, input, weight)
 
@@ -782,7 +782,7 @@
   custom_code_at_the_beginning: |
     std::vector<int64_t> output_size(input.sizes().begin(), input.sizes().end());
     output_size.back() = weight.sizes()[0];
-    auto out = at::empty(output_size, input.options());
+    auto out = nodispatch::empty(output_size, input.options());
   interface: diopiLinear(ctx, out, input, weight, bias)
 
 - schema: "_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!)"
@@ -810,7 +810,7 @@
   register_op: False
   size_attr: [kernel_size, stride, padding, dilation]
   custom_code_at_the_beginning: |
-    auto out = at::empty(input.sizes(), grad_output.options());
+    auto out = nodispatch::empty(input.sizes(), grad_output.options());
   interface: diopiMaxPool2dBackward(ctx, out, grad_output, input, kernel_size, stride, padding, dilation, ceil_mode, indices)
 
 - schema: "max_pool2d(Tensor input, int[2] kernel_size=1, int[2] stride=1, int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor"
@@ -824,7 +824,7 @@
     int64_t out_height = std::floor((height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1);
     int64_t out_width = std::floor((width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1);
     c10::SmallVector<int64_t, 4> output_size = {batch_size, channel, out_height, out_width};
-    at::Tensor out = at::empty(output_size, input.options());
+    at::Tensor out = nodispatch::empty(output_size, input.options());
   interface: diopiMaxPool2d(&context, out, input, kernel_size, stride, padding, dilation, ceil_mode)
   autograd: True
   saved_data: [kernel_size, stride, padding, dilation, input, ceil_mode]
@@ -861,7 +861,7 @@
     if (reduction != 0) {
         output = torch::tensor(0.0, self.options());
     } else {
-        output = at::empty(target.sizes(), self.options());
+        output = nodispatch::empty(target.sizes(), self.options());
     }
 
 - schema: nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
@@ -873,7 +873,7 @@
 - schema: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor grad_input
   interface: diopiNLLLossBackward(&context, grad_input, grad_output, self, target, weight, static_cast<diopiReduction_t>(reduction), ignore_index.expect_int());
   custom_code_at_the_beginning: |
-    auto grad_input = at::empty(self.sizes(), self.options());
+    auto grad_input = nodispatch::empty(self.sizes(), self.options());
 
 - schema: "threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)"
   interface: diopiThresholdBackward(ctx, grad_input, grad_output, self, &threshold)
@@ -937,7 +937,7 @@
     }
     const std::vector<int64_t>& const_tmp = tmp;
     shape = at::ArrayRef<int64_t>(const_tmp);
-    auto out = at::empty({shape}, tensors[0].options());
+    auto out = nodispatch::empty({shape}, tensors[0].options());
 
     std::vector<diopiConstTensorHandle_t> diopiTensorHandles(tensors.size());
     for (size_t i = 0; i < tensors.size(); ++i) {
@@ -964,8 +964,8 @@
     } else {
       dim_ = dim;
     }
-    auto values = at::empty(self.sizes(), self.options());
-    auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
+    auto values = nodispatch::empty(self.sizes(), self.options());
+    auto indices = nodispatch::empty(self.sizes(), self.options().dtype(at::kLong));
   interface: diopiSort(ctx, values, indices, self, dim_, descending, nullptr)
 
 - schema: "sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"
@@ -1000,7 +1000,7 @@
 
 - schema: "tril(Tensor self, int diagonal=0) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiTril(ctx, out, self, diagonal)
 
 - schema: "tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1008,12 +1008,12 @@
 
 - schema: "multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
     if (self.dim() == 2){
-      out = at::empty({self.size(0), num_samples}, self.options().dtype(at::kLong));
+      out = nodispatch::empty({self.size(0), num_samples}, self.options().dtype(at::kLong));
     }
     else if (self.dim() == 1) {
-      out = at::empty({num_samples,}, self.options().dtype(at::kLong));
+      out = nodispatch::empty({num_samples,}, self.options().dtype(at::kLong));
     }
   interface: diopiMultinomial(ctx, out, self, num_samples, replacement, generator)
 
@@ -1022,14 +1022,14 @@
 
 - schema: "roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
     ::diopiSize_t diopi_shifts = toDiopiSize(shifts);
     ::diopiSize_t diopi_dims = toDiopiSize(dims);
   interface: diopiRoll(ctx, out, self, diopi_shifts, diopi_dims)
 
 - schema: "leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiLeakyRelu(ctx, out, self, negative_slope)
 
 - schema: "leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)"
@@ -1047,9 +1047,9 @@
     at::Tensor out;
     auto options = self.options();
     if (reductionDiopi == ReductionNone) {
-      out = at::empty(self.sizes(), options);
+      out = nodispatch::empty(self.sizes(), options);
     } else {
-      out = at::empty({}, options);
+      out = nodispatch::empty({}, options);
     }
   interface: diopiMSELoss(ctx, out, self, target, reductionDiopi)
 
@@ -1060,7 +1060,7 @@
 
 - schema: "mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor grad_input"
   custom_code_at_the_beginning: |
-    auto grad_input = at::empty(self.sizes(), grad_output.options());
+    auto grad_input = nodispatch::empty(self.sizes(), grad_output.options());
     const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction);
   interface: diopiMSELossBackward(ctx, grad_input, grad_output, self, target, reductionDiopi)
 
@@ -1122,7 +1122,7 @@
   custom_code_at_the_beginning: |
     auto promoted_dtype = at::native::get_dtype_from_self(self, dtype, /*promote_integers=*/true);
     const auto self_dtype = at::native::to(self, promoted_dtype);
-    auto out = at::empty({}, self_dtype.options());
+    auto out = nodispatch::empty({}, self_dtype.options());
     ::diopiConstTensorHandle_t self_dtype_diopi = dipu::diopi_helper::toDiopiTensorHandle(self_dtype);
   interface: diopiProd(ctx, out, self_dtype_diopi, nullptr)
 
@@ -1145,12 +1145,12 @@
       output_size[j] *= self_sizes.at(i);
     }
 
-    at::Tensor out = at::empty(output_size, self.options());
+    at::Tensor out = nodispatch::empty(output_size, self.options());
   interface: diopiRepeat(ctx, out, self, repeats)
 
 - schema: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
     // NOLINTNEXTLINE(readability-suspicious-call-argument)
     return dipu_sub_out(other, self, alpha, out);
   interface: diopiSub(ctx, out, other, self, alpha)
@@ -1165,7 +1165,7 @@
       if (dim < 0) {
         dim += static_cast<int64_t>(ndims);
       }
-      indices = at::empty({self.sizes().at(dim)}, self.options().dtype(at::kLong));
+      indices = nodispatch::empty({self.sizes().at(dim)}, self.options().dtype(at::kLong));
     }
     diopiTensorHandle_t out_ptr = nullptr;
     diopiTensorHandle_t counts_ptr = nullptr;
@@ -1183,7 +1183,7 @@
     at::Tensor counts;
     at::Tensor indices;
     if (return_inverse) {
-      indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
+      indices = nodispatch::empty(self.sizes(), self.options().dtype(at::kLong));
     }
     diopiTensorHandle_t out_ptr = nullptr;
     diopiTensorHandle_t counts_ptr = nullptr;
@@ -1205,7 +1205,7 @@
 
 - schema: "masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiMaskedFill(ctx, out, self, mask, value)
 
 - schema: "masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)"
@@ -1213,7 +1213,7 @@
 
 - schema: "masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiMaskedFillScalar(ctx, out, self, mask, value)
 
 - schema: "masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)"
@@ -1221,7 +1221,7 @@
 
 - schema: "min(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty({}, self.options());
+    auto out = nodispatch::empty({}, self.options());
   interface: diopiMinAll(ctx, out, self)
 
 - schema: "min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) min, Tensor(b!) min_indices)"
@@ -1231,7 +1231,7 @@
 
 - schema: "max(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty({}, self.options());
+    auto out = nodispatch::empty({}, self.options());
   interface: diopiMaxAll(ctx, out, self)
 
 - schema: "maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1314,8 +1314,8 @@
   custom_code_at_the_beginning: |
     // todo:(wentao) temp solution, need using a type promotion strategy
     at::Tensor out = self.numel() == 1 && self.is_cpu() ? 
-        at::empty_like(other) :
-        at::empty_like(self);
+        nodispatch::empty_like(other) :
+        nodispatch::empty_like(self);
     out = dipu_div_out(self,other,out);
   interface: diopiFloorInp(ctx,out)
 
@@ -1323,7 +1323,7 @@
   custom_code_at_the_beginning: |
     auto shape = at::infer_size(condition.sizes(), self.sizes());
     shape = at::infer_size(shape, other.sizes());
-    auto out = at::empty(shape, self.options());
+    auto out = nodispatch::empty(shape, self.options());
   interface: diopiWhere(ctx, out, condition,self, other)
 
 - schema: "gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)"
@@ -1333,7 +1333,7 @@
   interface: diopiGeluBackward(ctx, grad_input, grad_output, self, approximate.data())
 
 - schema: "hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor"
-  custom_code_at_the_beginning: auto out = at::empty_like(self);
+  custom_code_at_the_beginning: auto out = nodispatch::empty_like(self);
   custom_code_before_call_diopi: |
     min_valDiopiScalar = dipu::diopi_helper::toDiopiScalar(min_val, at::kDouble);
     max_valDiopiScalar = dipu::diopi_helper::toDiopiScalar(max_val, at::kDouble);
@@ -1361,7 +1361,7 @@
 
 - schema: "hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor grad_input"
   custom_code_at_the_beginning: |
-    auto grad_input = at::empty(self.sizes(), grad_output.options());
+    auto grad_input = nodispatch::empty(self.sizes(), grad_output.options());
   interface: diopiHardtanhBackward(ctx, grad_input, grad_output, self, min_val, max_val)
 
 - schema: "upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1390,7 +1390,7 @@
       size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0));
       size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0));
     }
-    auto out = at::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    auto out = nodispatch::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
   interface: diopiUpsampleNearest(ctx, out, self, size);
 
 - schema: "upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1420,7 +1420,7 @@
       size[0] = std::floor(self.size(-2) * scales_h.value_or(1.0));
       size[1] = std::floor(self.size(-1) * scales_w.value_or(1.0));
     }
-    auto out = at::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    auto out = nodispatch::empty({self.size(0),self.size(1),size[0],size[1]},self.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
     const char* mode = "bilinear";
   interface: diopiUpsampleLinear(ctx, out, self, size, align_corners, mode);
 
@@ -1444,7 +1444,7 @@
     auto symInt2Int = [](const c10::SymInt& t)-> int64_t {return t.expect_int();};
     std::vector<int64_t> grad_input_shape(input_size.size());
     std::transform(input_size.cbegin(), input_size.cend(), grad_input_shape.begin(), symInt2Int);
-    auto grad_input = at::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    auto grad_input = nodispatch::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
   custom_code_before_call_diopi: |
     if (output_size.size() > 0) {
       std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
@@ -1475,7 +1475,7 @@
     auto symInt2Int = [](const c10::SymInt& t)-> int64_t {return t.expect_int();};
     std::vector<int64_t> grad_input_shape(input_size.size());
     std::transform(input_size.cbegin(), input_size.cend(), grad_input_shape.begin(), symInt2Int);
-    auto grad_input = at::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    auto grad_input = nodispatch::empty(grad_input_shape,grad_output.options(),${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
   custom_code_before_call_diopi: |
     if (output_size.size() > 0) {
       std::copy(output_sizeVector.begin(), output_sizeVector.end(), size.begin());
@@ -1488,7 +1488,7 @@
 
 - schema: "sin(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiSin(ctx, out, self)
 
 - schema: "sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1499,7 +1499,7 @@
 
 - schema: "cos(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
   interface: diopiCos(ctx, out, self)
 
 - schema: "cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1530,7 +1530,7 @@
 - schema: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
   autocompare: disable
   custom_code_at_the_beginning: |
-      auto out = at::empty_like(mean);
+      auto out = nodispatch::empty_like(mean);
   interface: diopiNormalTensorScalar(ctx, out, mean, std, generator)
 
 - schema: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
@@ -1540,7 +1540,7 @@
 - schema: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor
   autocompare: disable
   custom_code_at_the_beginning: |
-      auto out = at::empty_like(std);
+      auto out = nodispatch::empty_like(std);
   interface: diopiNormalScalarTensor(ctx, out, mean, std, generator)
 
 - schema: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
@@ -1550,7 +1550,7 @@
 - schema: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
   autocompare: disable
   custom_code_at_the_beginning: |
-      auto out = at::empty_like(mean);
+      auto out = nodispatch::empty_like(mean);
   interface: diopiNormalTensor(ctx, out, mean, std, generator)
 
 - schema: normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
@@ -1602,7 +1602,7 @@
       }
     }
 
-    auto out = at::empty(output_shape, self.options());
+    auto out = nodispatch::empty(output_shape, self.options());
 
   interface: diopiMatmul(ctx, out, self, other)
 
@@ -1631,7 +1631,7 @@
 
 - schema: "flip(Tensor self, int[] dims) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self);
+    auto out = nodispatch::empty_like(self);
     ::diopiSize_t diopi_size = toDiopiSize(dims);
   interface: diopiFlip(ctx, out,self,diopi_size)
 
@@ -1647,12 +1647,12 @@
     std::vector<int64_t> output_shape(shape.begin(), shape.end());
     dim += dim >= 0 ? 0 : static_cast<int64_t>(shape.size());
     output_shape[dim] = index.numel();
-    auto out = at::empty({output_shape}, self.options());
+    auto out = nodispatch::empty({output_shape}, self.options());
   interface: diopiIndexSelect(ctx, out, self, dim, index)
 
 - schema: "hardswish(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty(self.sizes(), self.options());
+    auto out = nodispatch::empty(self.sizes(), self.options());
   interface: diopiHardswish(ctx, out, self)
 
 - schema: "hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1663,7 +1663,7 @@
 
 - schema: "hardswish_backward(Tensor grad_output, Tensor self) -> Tensor grad_input"
   custom_code_at_the_beginning: |
-    auto grad_input = at::empty(self.sizes(), grad_output.options());
+    auto grad_input = nodispatch::empty(self.sizes(), grad_output.options());
   interface: diopiHardswishBackward(ctx, grad_input, grad_output, self)
 
 - schema: "sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1681,9 +1681,9 @@
     at::Tensor out;
     auto options = self.options();
     if (reductionDiopi == ReductionNone) {
-      out = at::empty(self.sizes(), options);
+      out = nodispatch::empty(self.sizes(), options);
     } else {
-      out = at::empty({}, options);
+      out = nodispatch::empty({}, options);
     }
   interface: diopiBCELoss(ctx, out, self, target, weight, reductionDiopi)
 
@@ -1696,7 +1696,7 @@
   register_op: False
   custom_code_at_the_beginning: |
     const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction);
-    at::Tensor grad_input = at::empty_like(log_probs);
+    at::Tensor grad_input = nodispatch::empty_like(log_probs);
 
   interface: diopiCTCLossBackward(ctx,  grad_input,  grad_output, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, reductionDiopi, zero_infinity)
 
@@ -1712,12 +1712,12 @@
     int64_t max_target_length = target_lengths.max().cpu().item().to<int64_t>();
     auto options = log_probs.options();
     if (reductionDiopi == ReductionNone) {
-      out = at::empty({batch_size}, options);
+      out = nodispatch::empty({batch_size}, options);
     } else {
-      out = at::empty({1}, options);
+      out = nodispatch::empty({1}, options);
     }
-    at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
-    at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
+    at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options);
+    at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
   interface: diopiCTCLoss(ctx, out, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths, target_lengths, blank, reductionDiopi, zero_infinity)
   forward_process_code: |
     auto targets_dev = targets.to(log_probs.device());
@@ -1751,8 +1751,8 @@
     int64_t num_labels = log_probs.size(2);
     int64_t max_target_length = target_lengths.max().cpu().item().to<int64_t>();
 
-    at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
-    at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
+    at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options);
+    at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
   backward_return_code: |
     /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h
     *
@@ -1794,15 +1794,15 @@
   custom_code_at_the_beginning: |
     const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction);
 
-    auto input_lengths_tensor = at::empty({static_cast<int64_t>(input_lengths.size())}, at::kLong);
-    auto target_lengths_tensor =  at::empty({static_cast<int64_t>(target_lengths.size())}, at::kLong);
+    auto input_lengths_tensor = nodispatch::empty_cpu({static_cast<int64_t>(input_lengths.size())}, at::kLong);
+    auto target_lengths_tensor =  nodispatch::empty_cpu({static_cast<int64_t>(target_lengths.size())}, at::kLong);
     std::copy(input_lengths.begin(), input_lengths.end(), static_cast<int64_t*>(input_lengths_tensor.data_ptr()));
     std::copy(target_lengths.begin(), target_lengths.end(), static_cast<int64_t*>(target_lengths_tensor.data_ptr()));
 
     input_lengths_tensor = input_lengths_tensor.to(log_probs.device());
     target_lengths_tensor = target_lengths_tensor.to(log_probs.device());
 
-    at::Tensor grad_input = at::empty_like(log_probs);
+    at::Tensor grad_input = nodispatch::empty_like(log_probs);
 
   interface: diopiCTCLossBackward(ctx,  grad_input,  grad_output, log_probs, targets, input_lengths_tensor, target_lengths_tensor, neg_log_likelihood, log_alpha, blank, reductionDiopi, zero_infinity)
 
@@ -1813,8 +1813,8 @@
   outs: [neg_log_likelihood, log_alpha]
   custom_code_at_the_beginning: |
     const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction);
-    auto input_lengths_tensor = at::empty({static_cast<int64_t>(input_lengths.size())}, at::kLong);
-    auto target_lengths_tensor =  at::empty({static_cast<int64_t>(target_lengths.size())}, at::kLong);
+    auto input_lengths_tensor = nodispatch::empty_cpu({static_cast<int64_t>(input_lengths.size())}, at::kLong);
+    auto target_lengths_tensor =  nodispatch::empty_cpu({static_cast<int64_t>(target_lengths.size())}, at::kLong);
     std::copy(input_lengths.begin(), input_lengths.end(), static_cast<int64_t*>(input_lengths_tensor.data_ptr()));
     std::copy(target_lengths.begin(), target_lengths.end(), static_cast<int64_t*>(target_lengths_tensor.data_ptr()));
 
@@ -1827,12 +1827,12 @@
     int64_t max_target_length = target_lengths_tensor.max().cpu().item().to<int64_t>();
     auto options = log_probs.options();
     if (reductionDiopi == ReductionNone) {
-      out = at::empty({batch_size}, options);
+      out = nodispatch::empty({batch_size}, options);
     } else {
-      out = at::empty({1}, options);
+      out = nodispatch::empty({1}, options);
     }
-    at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
-    at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
+    at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options);
+    at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
   interface: diopiCTCLoss(ctx, out, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths_tensor, target_lengths_tensor, blank, reductionDiopi, zero_infinity)
   backward_schema: "ctc_loss_intlist_backward(Tensor grad_output, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, int reduction=Mean, bool zero_infinity=False) -> Tensor grad_input"
   saved_data:
@@ -1862,8 +1862,8 @@
     // int64_t max_target_length = target_lengths_tensor.max().cpu().item().to<int64_t>();
     int64_t max_target_length = *std::max_element(target_lengths.begin(), target_lengths.end());
 
-    at::Tensor neg_log_likelihood = at::empty({batch_size}, options);
-    at::Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
+    at::Tensor neg_log_likelihood = nodispatch::empty({batch_size}, options);
+    at::Tensor log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, options);
   backward_return_code: |
     /* Note: This kernel's output size will be checked by pytorch/torch/csrc/autograd/custom_function.h
     *
@@ -1903,8 +1903,8 @@
     int64_t batch_size = log_probs.size(1);
     int64_t num_labels = log_probs.size(2);
     int64_t max_target_length = target_lengths.max().cpu().item().to<int64_t>();
-    auto neg_log_likelihood = at::empty({batch_size}, log_probs.options());
-    auto log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options());
+    auto neg_log_likelihood = nodispatch::empty({batch_size}, log_probs.options());
+    auto log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options());
   interface: diopiCTCLoss(ctx, neg_log_likelihood, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths, target_lengths, blank, ReductionNone, zero_infinity); # TODO: param log_alpha ?
 
 - schema: "_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor neg_log_likelihood, Tensor log_alpha)"
@@ -1915,10 +1915,10 @@
     int64_t batch_size = log_probs.size(1);
     int64_t num_labels = log_probs.size(2);
     int64_t max_target_length = *std::max_element(target_lengths.begin(), target_lengths.end());;
-    auto neg_log_likelihood = at::empty({batch_size}, log_probs.options());
-    auto log_alpha = at::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options());
-    auto input_lengths_tensor = at::empty({static_cast<int64_t>(input_lengths.size())}, at::kLong);
-    auto target_lengths_tensor =  at::empty({static_cast<int64_t>(target_lengths.size())}, at::kLong);
+    auto neg_log_likelihood = nodispatch::empty({batch_size}, log_probs.options());
+    auto log_alpha = nodispatch::empty({batch_size, log_probs.size(0), 2 * max_target_length + 1}, log_probs.options());
+    auto input_lengths_tensor = nodispatch::empty_cpu({static_cast<int64_t>(input_lengths.size())}, at::kLong);
+    auto target_lengths_tensor =  nodispatch::empty_cpu({static_cast<int64_t>(target_lengths.size())}, at::kLong);
     std::copy(input_lengths.begin(), input_lengths.end(), static_cast<int64_t*>(input_lengths_tensor.data_ptr()));
     std::copy(target_lengths.begin(), target_lengths.end(), static_cast<int64_t*>(target_lengths_tensor.data_ptr()));
   interface: diopiCTCLoss(ctx, neg_log_likelihood, neg_log_likelihood, log_alpha, log_probs, targets, input_lengths_tensor, target_lengths_tensor, blank, ReductionNone, zero_infinity); # TODO: param log_alpha ?
@@ -1926,7 +1926,7 @@
 - schema: "_ctc_loss_backward.Tensor(Tensor grad, Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor grad_input"
   device: [-camb, all]
   custom_code_at_the_beginning: |
-    auto grad_input = at::empty(log_probs.sizes(), grad.options());
+    auto grad_input = nodispatch::empty(log_probs.sizes(), grad.options());
   interface: diopiCTCLossBackward(ctx, grad_input, grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, ReductionNone, zero_infinity)
 
 - schema: "_ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor grad_input"
@@ -1934,9 +1934,9 @@
   no_device_check_args: [input_lengths_tensor, target_lengths_tensor]
   device: [-camb, all]
   custom_code_at_the_beginning: |
-    auto grad_input = at::empty(log_probs.sizes(), grad.options());
-    auto input_lengths_tensor = at::empty({static_cast<int64_t>(input_lengths.size())}, at::kLong);
-    auto target_lengths_tensor =  at::empty({static_cast<int64_t>(target_lengths.size())}, at::kLong);
+    auto grad_input = nodispatch::empty(log_probs.sizes(), grad.options());
+    auto input_lengths_tensor = nodispatch::empty_cpu({static_cast<int64_t>(input_lengths.size())}, at::kLong);
+    auto target_lengths_tensor =  nodispatch::empty_cpu({static_cast<int64_t>(target_lengths.size())}, at::kLong);
     std::copy(input_lengths.begin(), input_lengths.end(), static_cast<int64_t*>(input_lengths_tensor.data_ptr()));
     std::copy(target_lengths.begin(), target_lengths.end(), static_cast<int64_t*>(target_lengths_tensor.data_ptr()));
   interface: diopiCTCLossBackward(ctx, grad_input, grad, log_probs, targets, input_lengths_tensor, target_lengths_tensor, neg_log_likelihood, log_alpha, blank, ReductionNone, zero_infinity)
@@ -2067,7 +2067,7 @@
     auto output_shape = at::infer_size(x1_shape, x2_shape);
     *output_shape.rbegin() = x2.size(-2);
     *(output_shape.rbegin() + 1) = x1.size(-2);
-    auto out = at::empty(output_shape, x1.options());
+    auto out = nodispatch::empty(output_shape, x1.options());
   interface: diopiCdist(ctx, out, x1, x2, p, compute_mode_ptr)
 
 - schema: "_cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor grad_input"
@@ -2079,7 +2079,7 @@
     auto grad_shape = at::infer_size(x1_shape, x2_shape);
     *grad_shape.rbegin() = x1.size(-1);
     *(grad_shape.rbegin() + 1) = x1.size(-2);
-    auto grad_input = at::empty(grad_shape, grad.options());
+    auto grad_input = nodispatch::empty(grad_shape, grad.options());
   interface: diopiCdistBackward(ctx, grad_input, grad, x1, x2, p, cdist)
 
 - schema: "erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -2091,7 +2091,7 @@
 - schema: "polar(Tensor abs, Tensor angle) -> Tensor"
   custom_code_at_the_beginning: |
     auto dtype = c10::toComplexType(abs.scalar_type());
-    auto out = at::empty(abs.sizes(), abs.options().dtype(dtype));
+    auto out = nodispatch::empty(abs.sizes(), abs.options().dtype(dtype));
   interface: diopiPolar(ctx, out, abs, angle)
 
 - schema: "polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)"
@@ -2129,7 +2129,7 @@
     if(batched_input){
       out_shape.insert(out_shape.begin(), input_shape[0]);
     }
-    auto out = at::empty({out_shape}, self.options());
+    auto out = nodispatch::empty({out_shape}, self.options());
   interface: diopiIm2Col(ctx, out, self, kernel_size, dilation, padding, stride)
 
 - schema: "col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor"
@@ -2151,7 +2151,7 @@
     if(batched_input){
       out_shape.insert(out_shape.begin(), input_shape[0]);
     }
-    auto out = at::empty({out_shape}, self.options());
+    auto out = nodispatch::empty({out_shape}, self.options());
   interface: diopiCol2Im(ctx, out, self, output_size, kernel_size, dilation, padding, stride)
 
 - schema: "sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -2162,7 +2162,7 @@
 
 - schema: "isnan(Tensor self) -> Tensor"
   custom_code_at_the_beginning: |
-    auto out = at::empty(self.sizes(), self.options().dtype(at::kBool));
+    auto out = nodispatch::empty(self.sizes(), self.options().dtype(at::kBool));
   interface: diopiIsNan(ctx, out, self)
 
 - schema: "embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor grad_weight"
@@ -2184,15 +2184,15 @@
 - schema: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
   custom_code_at_the_beginning: |
     auto shape = input.size(1);
-    auto out0 = at::empty({shape}, input.options().dtype(at::kFloat));
-    auto out1 = at::empty({shape}, input.options().dtype(at::kFloat));
+    auto out0 = nodispatch::empty({shape}, input.options().dtype(at::kFloat));
+    auto out1 = nodispatch::empty({shape}, input.options().dtype(at::kFloat));
   interface: diopiBatchNormStats(ctx, out0, out1, input, eps)
 
 - schema: batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor)
   custom_code_at_the_beginning: |
     auto shape = input.size(1);
-    auto out0 = at::empty({shape}, input.options().dtype(at::kFloat));
-    auto out1 = at::empty({shape}, input.options().dtype(at::kFloat));
+    auto out0 = nodispatch::empty({shape}, input.options().dtype(at::kFloat));
+    auto out1 = nodispatch::empty({shape}, input.options().dtype(at::kFloat));
   interface: diopiBatchNormGatherStatsWithCounts(ctx, out0, out1, input, mean, invstd, const_cast<diopiTensorHandle_t>(running_mean), const_cast<diopiTensorHandle_t>(running_var), static_cast<float>(momentum), static_cast<float>(eps), counts)
   custom_code_before_call_diopi: |
     // NOTE: const_cast here is safe according to pytorch's source code
@@ -2208,25 +2208,25 @@
     at::Tensor out2;
     at::Tensor out3;
     if(input_g){
-      out0 = at::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
-      out1 = at::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+      out0 = nodispatch::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+      out1 = nodispatch::empty({shape}, input.options().dtype(at::kFloat), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
     }
     if(weight_g){
-      out2 = at::empty({shape}, input.options().dtype(at::kFloat));
+      out2 = nodispatch::empty({shape}, input.options().dtype(at::kFloat));
     }
     if(bias_g){
-      out3 = at::empty({shape}, input.options().dtype(at::kFloat));
+      out3 = nodispatch::empty({shape}, input.options().dtype(at::kFloat));
     }
   interface: diopiBatchNormBackwardReduce(ctx, out0, out1, out2, out3, grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g)
 
 - schema: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(grad_out, grad_out.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    auto out = nodispatch::empty_like(grad_out, grad_out.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
   interface: diopiBatchNormBackwardElemt(ctx, out, grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count);
 
 - schema: batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(input, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
+    auto out = nodispatch::empty_like(input, input.options(), ${PREFERRED_MEMORY_FORMAT_PLACEHOLDER:-c10::nullopt});
   interface: diopiBatchNormElemt(ctx, out, input, weight, bias, mean, invstd, static_cast<float>(eps));
 
 - schema: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
@@ -2251,7 +2251,7 @@
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
       auto& in = self[i];
-      out[i] = at::empty(in.sizes(), in.options());
+      out[i] = nodispatch::empty(in.sizes(), in.options());
       dipu_add_out(self.at(i), other.at(i), alpha, out[i]);
     }
     return out;
@@ -2272,7 +2272,7 @@
   custom_code_at_the_beginning: |
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
-      out[i] = at::empty(self[i].sizes(), self[i].options());
+      out[i] = nodispatch::empty(self[i].sizes(), self[i].options());
       dipu_add_scalar_out(self[i], scalar, 1.0 , out[i]);
     }
     return out;
@@ -2293,7 +2293,7 @@
   custom_code_at_the_beginning: |
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
-      out[i] = at::empty_like(self[i]);
+      out[i] = nodispatch::empty_like(self[i]);
       dipu_mul_out(self[i], other[i], out[i]);
     }
     return out;
@@ -2325,7 +2325,7 @@
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
       auto& in = self[i];
-      out[i] = at::empty(in.sizes(), in.options());
+      out[i] = nodispatch::empty(in.sizes(), in.options());
       dipu_mul_scalar_out(self.at(i), scalar, out[i]);
     }
     return out;
@@ -2337,7 +2337,7 @@
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
       auto& in = self[i];
-      out[i] = at::empty(in.sizes(), in.options());
+      out[i] = nodispatch::empty(in.sizes(), in.options());
       dipu_mul_scalar_out(self.at(i), scalars[i], out[i]);
     }
     return out;
@@ -2358,7 +2358,7 @@
   custom_code_at_the_beginning: |
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
-      out[i] = at::empty_like(self[i]);
+      out[i] = nodispatch::empty_like(self[i]);
       dipu_div_out(self[i], other[i], out[i]);
     }
     return out;
@@ -2390,7 +2390,7 @@
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
       auto& in = self[i];
-      out[i] = at::empty(in.sizes(), in.options());
+      out[i] = nodispatch::empty(in.sizes(), in.options());
       dipu_div_scalar_out(self.at(i), scalar, out[i]);
     }
     return out;
@@ -2402,7 +2402,7 @@
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
       auto& in = self[i];
-      out[i] = at::empty(in.sizes(), in.options());
+      out[i] = nodispatch::empty(in.sizes(), in.options());
       dipu_div_scalar_out(self.at(i), scalars[i], out[i]);
     }
     return out;
@@ -2485,7 +2485,7 @@
   custom_code_at_the_beginning: |
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
-      out[i] = at::empty_like(self[i]);
+      out[i] = nodispatch::empty_like(self[i]);
       dipu_sqrt_out(self[i], out[i]);
     }
     return out;
@@ -2497,7 +2497,7 @@
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
       auto& in = self[i];
-      out[i] = at::empty(in.sizes(), in.options());
+      out[i] = nodispatch::empty(in.sizes(), in.options());
       dipu_neg_out(self.at(i), out[i]);
     }
     return out;
@@ -2509,7 +2509,7 @@
     std::vector<at::Tensor> out(self.size());
     for (size_t i = 0;i < self.size();i++) {
       auto& in = self[i];
-      out[i] = at::empty({}, in.options());
+      out[i] = nodispatch::empty({}, in.options());
       dipu_norm_out(in, ord, {}, false, out[i]);
     }
     return out;
@@ -2520,7 +2520,7 @@
 - schema: "wrap_diopi_cast_dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)"
   register_op: False
   custom_code_at_the_beginning: |
-    auto out = at::empty_like(self, self.options().dtype(dtype));
+    auto out = nodispatch::empty_like(self, self.options().dtype(dtype));
   interface: diopiCastDtype(ctx, out, self);
 
 # a diopi func wrapper.
diff --git a/dipu/tests/python/unittests/test_profiler_vendor.py b/dipu/tests/python/unittests/test_profiler_vendor.py
index ff9ff1f59..f94c81a66 100644
--- a/dipu/tests/python/unittests/test_profiler_vendor.py
+++ b/dipu/tests/python/unittests/test_profiler_vendor.py
@@ -32,7 +32,7 @@ def test_aot_profiler(self):
         with torch_dipu.profiler.NativeProfile(path, True):
             x.add_(y)
 
-        self.assertTrue(check_string_in_directory(path, "test_profiler.py"))
+        self.assertTrue(check_string_in_directory(path, "test_profiler_vendor.py"))
         self.assertTrue(check_string_in_directory(path, "aten::add_"))
         self.assertTrue(check_string_in_directory(path, "Add"))
 
@@ -54,13 +54,13 @@ def fn(x):
             y = opt_model(input)
             z = y + y
 
-        self.assertTrue(check_string_in_directory(path, "test_profiler.py"))
+        self.assertTrue(check_string_in_directory(path, "test_profiler_vendor.py"))
         self.assertTrue(check_string_in_directory(path, "aten::add"))
         self.assertTrue(check_string_in_directory(path, "mulrelu"))
         self.assertTrue(check_string_in_directory(path, "softmax"))
 
     @onlyOn("CUDA")
-    def test_profiler(self):
+    def test_profiler_cuda(self):
         model = models.resnet18().cuda()
         inputs = torch.randn(5, 3, 224, 224).cuda()
 
@@ -110,7 +110,7 @@ def test_profiler(self):
             prof.export_chrome_trace(f"{tmpdir}/resnet18_profiler_cuda.json")
 
     @onlyOn("MLU")
-    def test_profiler(self):
+    def test_profiler_mlu(self):
         model = models.resnet18().cuda()
         inputs = torch.randn(5, 3, 224, 224).cuda()
 
diff --git a/dipu/third_party/kineto b/dipu/third_party/kineto
index 926533879..08e23777d 160000
--- a/dipu/third_party/kineto
+++ b/dipu/third_party/kineto
@@ -1 +1 @@
-Subproject commit 92653387900fe7637a4065e710332dc81b73366d
+Subproject commit 08e23777de5f6c95f1c58b2e98654a0ce70f8a1c
diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp
index 38fed1826..c19247ba3 100644
--- a/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp
+++ b/dipu/torch_dipu/csrc_dipu/aten/ops/NodispatchUtils.hpp
@@ -6,10 +6,17 @@
 
 #pragma once
 
+#include <ATen/NamedTensorUtils.h>
 #include <ATen/core/ATen_fwd.h>
 #include <ATen/core/TensorBody.h>
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/Layout.h>
 #include <c10/core/MemoryFormat.h>
+#include <c10/core/ScalarType.h>
 #include <c10/core/ScalarTypeToTypeMeta.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Exception.h>
 #include <c10/util/Optional.h>
 
 #include "csrc_dipu/aten/DIPUATenFunctions.h"
@@ -30,9 +37,69 @@ inline at::Tensor empty(
                                                                 memory_format));
 }
 
+// CPU-only allocation without dispatch: strided, unpinned, contiguous default
+inline at::Tensor empty_cpu(
+    at::IntArrayRef size, at::ScalarType dtype,
+    c10::optional<at::Device> device_opt = c10::nullopt,
+    c10::optional<at::MemoryFormat> memory_format = c10::nullopt) {
+  return dipu_aten::empty_cpu(  // honors `memory_format` when it is given
+      size, dtype, at::Layout::Strided, device_or_default(device_opt), false,
+      memory_format.value_or(c10::get_contiguous_memory_format()));
+}
+
+// Dispatch-free counterpart of `at::empty_like` taking unpacked options.
+// The requested fields are merged into `self.options()`; names are propagated
+// and the conj/neg/zero bits are cleared on the result.
+inline at::Tensor empty_like(
+    const at::Tensor& self, c10::optional<at::ScalarType> dtype,
+    c10::optional<at::Layout> layout, c10::optional<at::Device> device,
+    c10::optional<bool> pin_memory,
+    c10::optional<at::MemoryFormat> optional_memory_format) {
+  // Overlay the explicitly requested fields on top of `self`'s options.
+  const at::TensorOptions requested = at::TensorOptions()
+                                          .dtype(dtype)
+                                          .layout(layout)
+                                          .device(device)
+                                          .pinned_memory(pin_memory);
+  const at::TensorOptions merged =
+      self.options().merge_in(requested).merge_memory_format(
+          optional_memory_format);
+
+  const bool strided = merged.layout() == c10::kStrided;
+  TORCH_CHECK(strided || !optional_memory_format.has_value(),
+              "memory format option is only supported by strided tensors");
+
+  // TODO(liuweiyu): need to implement nodispatch::empty_strided
+  // Until then `Preserve` (also the fallback when no format was requested) is
+  // replaced by the format suggested by `self`.
+  const auto requested_format =
+      merged.memory_format_opt().value_or(at::MemoryFormat::Preserve);
+  const auto target_format = requested_format == at::MemoryFormat::Preserve
+                                 ? self.suggest_memory_format()
+                                 : requested_format;
+  at::Tensor result =
+      empty(self.sizes(), merged.memory_format(target_format), c10::nullopt);
+
+  if (self.opt_names()) {
+    at::namedinference::propagate_names(result, self.names());
+  }
+
+  // never propagate Conjugate, Negative, and ZeroTensor dispatch key
+  result._set_conj(false);
+  result._set_neg(false);
+  result._set_zero(false);
+  return result;
+}
+
-// an simplified version of `at::empty_like` but without dispatch
+// a simplified version of `at::empty_like` but without dispatch
-inline at::Tensor empty_like(const at::Tensor& self) {
-  return empty(self.sizes(), self.options());
+inline at::Tensor empty_like(
+    const at::Tensor& self, at::TensorOptions options = {},
+    c10::optional<at::MemoryFormat> memory_format = c10::nullopt) {
+  return nodispatch::empty_like(  // forwards to the unpacked-options overload
+      self, c10::optTypeMetaToScalarType(options.dtype_opt()),
+      options.layout_opt(), options.device_opt(), options.pinned_memory_opt(),
+      c10::impl::check_tensor_options_and_extract_memory_format(options,
+                                                                memory_format));
 }
 
 }  // namespace nodispatch
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp
index 0efbd3364..4ccaaa190 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp
+++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.cpp
@@ -48,6 +48,55 @@ using result_ptr_t = std::shared_ptr<torch::profiler::impl::Result>;
 using trace_ptr_t =
     std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>;
 
+namespace {
+struct TagToIOType {  // one tag_map row: a Tag and the IOType it maps to
+  DIPUInputOutputEncoder::Tag tag;
+  DIPUInputOutputEncoder::IOType io_type;
+};
+
+constexpr int tagCount =  // number of Tag enumerators, TERMINATOR included
+    (static_cast<int>(DIPUInputOutputEncoder::Tag::TERMINATOR)) + 1;
+constexpr std::array<TagToIOType, tagCount> tag_map = {{  // index == Tag
+    {DIPUInputOutputEncoder::Tag::Tensor,
+     DIPUInputOutputEncoder::IOType::Shapes},
+    {DIPUInputOutputEncoder::Tag::UndefinedTensor,
+     DIPUInputOutputEncoder::IOType::Shapes},
+    {DIPUInputOutputEncoder::Tag::TensorListBegin,
+     DIPUInputOutputEncoder::IOType::Shapes},
+    {DIPUInputOutputEncoder::Tag::ScalarList,
+     DIPUInputOutputEncoder::IOType::ConcreteInputs},  // the only concrete one
+    {DIPUInputOutputEncoder::Tag::Scalar,
+     DIPUInputOutputEncoder::IOType::Shapes},
+    {DIPUInputOutputEncoder::Tag::Other,
+     DIPUInputOutputEncoder::IOType::Shapes},
+    {DIPUInputOutputEncoder::Tag::TERMINATOR,
+     DIPUInputOutputEncoder::IOType::None},
+}};
+
+constexpr bool allTagsMapped(int idx = 0) {  // row idx must hold Tag idx
+  return tag_map[idx].tag == DIPUInputOutputEncoder::Tag::TERMINATOR ||
+         ((idx == static_cast<int>(tag_map[idx].tag)) &&
+          allTagsMapped(idx + 1));
+}
+static_assert(allTagsMapped(), "tag_map is out of order");
+
+constexpr DIPUInputOutputEncoder::IOType tagToIOType(
+    DIPUInputOutputEncoder::Tag tag) {
+  return tag_map[static_cast<int>(tag)].io_type;  // Tag value is the index
+}
+
+constexpr int32_t kScalarListLengthLimit = 30;  // longer lists are skipped
+
+bool dipu_get_record_concrete_inputs_enabled() {
+#if DIPU_TORCH_VERSION == 20000
+  return false;  // concrete inputs are never recorded for this version
+#else
+  return torch::profiler::impl::get_record_concrete_inputs_enabled();
+#endif
+}
+
+}  // namespace
+
 void DIPUInputOutputEncoder::push(c10::ArrayRef<const c10::IValue> values) {
   for (const auto& value : values) {
     if (value.isTensor()) {
@@ -65,6 +114,9 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef<const c10::IValue> values) {
         push(t);
       }
       tags_.emplace_back(Tag::TERMINATOR);
+    } else if (isSupportedScalarList(value)) {
+      tags_.emplace_back(Tag::ScalarList);
+      ivalues_.emplace_back(value);
     } else {
       tags_.emplace_back(Tag::Other);
     }
@@ -73,7 +125,9 @@ void DIPUInputOutputEncoder::push(c10::ArrayRef<const c10::IValue> values) {
 }
 
 void DIPUInputOutputEncoder::push(const at::Tensor& t) {
-  if (t.defined() && !t.is_nested()) {  // TODO(caikun-pjlab) fix nested sizes
+  // TODO(caikun-pjlab) fix nested sizes
+  if (t.defined() && !t.is_nested() &&
+      !t.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) {
     tags_.emplace_back(Tag::Tensor);
     tensor_metadata_.emplace_back(t);
     tensor_sizes_strides_.copy(t.sizes());
@@ -86,12 +140,42 @@ void DIPUInputOutputEncoder::push(const at::Tensor& t) {
   }
 }
 
-// This is a custom-iterator-like getter to obtain input shapes and dtypes.
-auto DIPUInputOutputEncoder::getNextShapesAndDtypes() {
+bool DIPUInputOutputEncoder::isSupportedScalarList(
+    const c10::IValue& list_candidate) {
+  // Decides whether `list_candidate` may be recorded as a scalar list.
+  // Scalar lists can be very long, so a list is only accepted when its
+  // first element is a scalar and it has at most kScalarListLengthLimit
+  // elements. An empty list is accepted.
+
+  const bool recordable =
+      dipu_get_record_concrete_inputs_enabled() && list_candidate.isList();
+  if (!recordable) {
+    return false;
+  }
+
+  const auto elements = list_candidate.toListRef();
+  if (C10_UNLIKELY(elements.empty())) {
+    return true;
+  }
+
+  // Only the first element is inspected to classify the list.
+  if (C10_UNLIKELY(!elements[0].isScalar())) {
+    return false;
+  }
+  // Finally enforce the length cap.
+  return elements.size() <= kScalarListLengthLimit;
+}
+
+// io_type is used to filter the ivalues between 'Shapes' and 'Concrete Args'.
+// Shapes are used to represent the shapes of tensors. We save only the shapes
+//   of the tensors because tensors can be large.
+// Concrete args are separated to clarify that they are the actual values.
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
+auto DIPUInputOutputEncoder::getIValueGenerator(const IOType& io_type) {
   return [this, tag_it = tags_.begin(),
           tensor_metadata_it = tensor_metadata_.begin(),
           tensor_size_strides_it = tensor_sizes_strides_.begin(),
-          ivals_it = ivalues_.begin()]() mutable {
+          ivals_it = ivalues_.begin(), io_type]() mutable {
     auto decode_tensor = [&]() -> TensorMetadata {
       const auto& raw_metadata = *tensor_metadata_it++;
       std::vector<int64_t> sizes;
@@ -108,29 +192,48 @@ auto DIPUInputOutputEncoder::getNextShapesAndDtypes() {
     };
 
     std::vector<op_input_t> out;
+    auto push_value = [&out, io_type](const Tag& tag, op_input_t input) {
+      if (io_type == tagToIOType(tag)) {
+        out.push_back(std::move(input));
+      } else {
+        out.emplace_back(c10::nullopt);
+      }
+    };
+
     bool terminate = false;
     while (!terminate && tag_it != tags_.end()) {
       switch (*tag_it) {
         case Tag::Tensor:
-          out.emplace_back(decode_tensor());
+          push_value(*tag_it, decode_tensor());
           break;
 
         case Tag::TensorListBegin: {
           std::vector<TensorMetadata> arg;
+          bool found_undefined = false;
           while (*(++tag_it) != Tag::TERMINATOR) {
-            TORCH_INTERNAL_ASSERT(*tag_it == Tag::Tensor, (int)(*tag_it));
+            if (*tag_it == Tag::UndefinedTensor) {
+              found_undefined = true;
+              continue;
+            }
+            TORCH_INTERNAL_ASSERT(*tag_it == Tag::Tensor,
+                                  static_cast<int>(*tag_it));
             arg.emplace_back(decode_tensor());
           }
-          out.emplace_back(std::move(arg));
+          if (found_undefined) {
+            push_value(*tag_it, c10::nullopt);
+          } else {
+            push_value(Tag::TensorListBegin, std::move(arg));
+          }
         } break;
 
+        case Tag::ScalarList:
         case Tag::Scalar:
-          out.emplace_back(*ivals_it++);
+          push_value(*tag_it, *ivals_it++);
           break;
 
         case Tag::UndefinedTensor:
         case Tag::Other:
-          out.emplace_back(c10::nullopt);
+          push_value(*tag_it, c10::nullopt);
           break;
 
         case Tag::TERMINATOR:
@@ -147,6 +250,14 @@ auto DIPUInputOutputEncoder::getNextShapesAndDtypes() {
   };
 }
 
+auto DIPUInputOutputEncoder::getInputShapeGenerator() {
+  return getIValueGenerator(IOType::Shapes);  // others -> nullopt
+}
+
+auto DIPUInputOutputEncoder::getConcreteInputGenerator() {
+  return getIValueGenerator(IOType::ConcreteInputs);  // others -> nullopt
+}
+
 void DIPUInputOutputEncoder::clear() {
   tags_.clear();
   tensor_metadata_.clear();
@@ -289,7 +400,8 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize(
     }
   }
 
-  auto input_getter = inputs_outputs_.getNextShapesAndDtypes();
+  auto input_shape_getter = inputs_outputs_.getInputShapeGenerator();
+  auto concrete_input_getter = inputs_outputs_.getConcreteInputGenerator();
 
   // TODO(caikun-pjlab): CTAD will take care of template args when we move to
   // C++17
@@ -302,10 +414,10 @@ void DIPUThreadLocalSubqueue::TorchOpStorage::materialize(
     ExtraFields<torch::profiler::impl::EventType::TorchOp> e {
       std::move(event->basic_fields_),
           DIPUThreadLocalSubqueue::TorchOpStorage::OpList::correlationID(event),
-          time_converter(event->end_time_), input_getter(),
+          time_converter(event->end_time_), input_shape_getter(),
 #if DIPU_TORCH_VERSION == 20000
 #else
-          input_getter(),
+          concrete_input_getter(),
 #endif
           jit_stack(), jit_module(), extra_args(), gpu_fallback(),
           event->allow_tf32_cublas_, std::move(event->counters_)
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/collection.h b/dipu/torch_dipu/csrc_dipu/profiler/collection.h
index c937d796f..d78c15b61 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/collection.h
+++ b/dipu/torch_dipu/csrc_dipu/profiler/collection.h
@@ -34,23 +34,31 @@ class DIPUInputOutputEncoder final {
  public:
   void push(c10::ArrayRef<const c10::IValue> values);
 
-  // Used during post-processing to create vectors for shapes and dtype.
-  auto getNextShapesAndDtypes();
+  auto getInputShapeGenerator();
+  auto getConcreteInputGenerator();
+  static bool isSupportedScalarList(const c10::IValue& list_candidate);
 
   void clear();
 
- private:
   enum class Tag {
     Tensor = 0,
     UndefinedTensor,
     TensorListBegin,  // TODO(caikun-pjlab): generalize to other lists.
+    ScalarList,
     Scalar,
     Other,
     TERMINATOR
   };
 
+  enum class IOType { Shapes, ConcreteInputs, None };
+
+ private:
   void push(const at::Tensor& t);
 
+  // Implementation detail for getInputShapeGenerator and
+  // getConcreteInputGenerator
+  auto getIValueGenerator(const IOType& io_type);
+
   torch::profiler::impl::AppendOnlyList<
       Tag, torch::profiler::impl::IO_ENCODER_DEFAULT_BLOCK_SIZE>
       tags_;
diff --git a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp
index fa9856bd7..51fff5078 100644
--- a/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp
+++ b/dipu/torch_dipu/csrc_dipu/profiler/profiler_kineto.cpp
@@ -28,16 +28,21 @@ namespace profile {
 namespace {
 inline int64_t getTimeUs() {
   auto constexpr scale = int64_t{1000};
-  return torch::profiler::impl::getTime(true) / scale;
+  return torch::profiler::impl::getTime() / scale;  // presumably ns -> us
 }
 
-const std::set<libkineto::ActivityType> kCpuTypes{
-    libkineto::ActivityType::CPU_OP,
-    libkineto::ActivityType::CPU_INSTANT_EVENT,
-    libkineto::ActivityType::USER_ANNOTATION,
-    libkineto::ActivityType::EXTERNAL_CORRELATION,
-    libkineto::ActivityType::CUDA_RUNTIME,
-    libkineto::ActivityType::PYTHON_FUNCTION,
+const std::set<libkineto::ActivityType> kCpuTypes {
+  libkineto::ActivityType::CPU_OP, libkineto::ActivityType::CPU_INSTANT_EVENT,
+      libkineto::ActivityType::USER_ANNOTATION,
+      libkineto::ActivityType::EXTERNAL_CORRELATION,
+#if DIPU_TORCH_VERSION == 20000
+      libkineto::ActivityType::CUDA_RUNTIME,
+#else  // other versions additionally list XPU_RUNTIME and CUDA_DRIVER
+      libkineto::ActivityType::XPU_RUNTIME,
+      libkineto::ActivityType::CUDA_RUNTIME,
+      libkineto::ActivityType::CUDA_DRIVER,
+#endif
+      libkineto::ActivityType::PYTHON_FUNCTION,
 };
 
 using torch::autograd::profiler::experimental_event_t;
@@ -46,8 +51,10 @@ using torch::autograd::profiler::post_process_t;
 using torch::autograd::profiler::ProfilerResult;
 using torch::profiler::impl::ActiveProfilerType;
 #if DIPU_TORCH_VERSION == 20000
-using torch::profiler::impl::dtypesToStr;
+constexpr auto strListToStr = torch::profiler::impl::dtypesToStr;
 #else
+using torch::profiler::impl::ivalueListToStr;
+constexpr auto strListToStr = torch::profiler::impl::strListToStr;
 #endif
 using torch::profiler::impl::EventType;
 using torch::profiler::impl::ExtraFields;
@@ -60,30 +67,61 @@ using torch::profiler::impl::shapesToStr;
 using torch::profiler::impl::stacksToStr;
 using torch::profiler::impl::TensorMetadata;
 
-auto shapesAndDtypes(const std::vector<op_input_t>& inputs) {
+struct OpArgData {
+  bool has_data;  // false when parseArgData received no input shapes
   std::vector<std::vector<int64_t>> shapes;
   std::vector<std::string> dtypes;
-  for (const auto& i : inputs) {
+  std::vector<c10::IValue> concrete_inputs;  // empty when none were parsed
+};
+
+auto parseArgData(const std::vector<op_input_t>& input_shapes,
+                  const std::vector<op_input_t>& concrete_inputs) {
+  if (input_shapes.empty()) {  // nothing recorded: report has_data == false
+    return OpArgData{false, {}, {}, {}};
+  }
+
+  std::vector<std::vector<int64_t>> shapes(input_shapes.size());
+  std::vector<std::string> dtypes(input_shapes.size());  // "" unless set below
+  std::vector<c10::IValue> concrete_inputs_list;
+
+  for (const auto& i : c10::irange(input_shapes.size())) {
     c10::visit(c10::overloaded(
                    [&](const TensorMetadata& t) {
-                     shapes.emplace_back(t.sizes_);
-                     dtypes.emplace_back(scalarTypeToTypeMeta(t.dtype_).name());
+                     shapes[i] = t.sizes_;
+                     // Look the type name up once and copy it into a string.
+                     const auto name = scalarTypeToTypeMeta(t.dtype_).name();
+                     dtypes[i] = std::string(name.data(), name.size());
                    },
                    [&](const std::vector<TensorMetadata>&) {
-                     shapes.emplace_back();
-                     dtypes.emplace_back("TensorList");
+                     dtypes[i] = "TensorList";
                    },
-                   [&](const c10::IValue&) {
-                     shapes.emplace_back();
-                     dtypes.emplace_back("Scalar");
-                   },
-                   [&](const auto&) {
-                     shapes.emplace_back();
-                     dtypes.emplace_back();
-                   }),
-               i);
+                   [&](const c10::IValue&) { dtypes[i] = "Scalar"; },
+                   [&](const auto&) {}),
+               input_shapes[i]);
+  }
+
+  // Concrete inputs are only parsed when there is exactly one per input shape
+  if (input_shapes.size() == concrete_inputs.size() &&
+      !concrete_inputs.empty()) {
+    concrete_inputs_list.resize(input_shapes.size());
+
+    for (const auto& i : c10::irange(input_shapes.size())) {
+      c10::visit(
+          c10::overloaded(
+              [&](const c10::IValue& val) { concrete_inputs_list[i] = val; },
+              [&](const auto&) {}),
+          input_shapes[i]);
+      c10::visit(c10::overloaded(
+                     [&](const c10::IValue& val) {
+                       concrete_inputs_list[i] = val;
+                       dtypes[i] = "ScalarList";
+                     },
+                     [&](const auto&) {}),
+                 concrete_inputs[i]);
+    }
   }
-  return std::make_pair(shapes, dtypes);
+
+  return OpArgData{true, shapes, dtypes, concrete_inputs_list};
 }
 
 struct MetadataBase {
@@ -163,20 +201,28 @@ struct AddGenericMetadata : public MetadataBase {
   }
 
   void operator()(ExtraFields<EventType::TorchOp>& op_event) {
-    const auto shapes_and_dtypes = shapesAndDtypes(op_event.inputs_);
-    if (!shapes_and_dtypes.first.empty()) {
-      addMetadata("Input Dims", shapesToStr(shapes_and_dtypes.first));
-    }
 #if DIPU_TORCH_VERSION == 20000
-    if (!shapes_and_dtypes.second.empty()) {
-      addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second));
-    }
+    const std::vector<op_input_t> concrete_inputs;
 #else
+    const auto& concrete_inputs = op_event.concrete_inputs_;
 #endif
+    const auto arg_data = parseArgData(op_event.inputs_, concrete_inputs);
+
+    if (arg_data.has_data) {
+      addMetadata("Input Dims", shapesToStr(arg_data.shapes));
+      addMetadata("Input type", strListToStr(arg_data.dtypes));
+#if DIPU_TORCH_VERSION == 20000
+#else
+      if (!arg_data.concrete_inputs.empty()) {
+        addMetadata("Concrete Inputs",
+                    ivalueListToStr(arg_data.concrete_inputs));
+      }
+#endif
+    }
 
     if (config_ && !config_->experimental_config.performance_events.empty()) {
       auto& event_names = config_->experimental_config.performance_events;
-      for (auto i = 0; i < op_event.perf_event_counters_->size(); ++i) {
+      for (const auto i : c10::irange(op_event.perf_event_counters_->size())) {
         addMetadata(event_names[i],
                     std::to_string((*op_event.perf_event_counters_)[i]));
       }
@@ -308,7 +354,7 @@ struct DIPUKinetoThreadLocalState : public ProfilerStateBase {
 
   void materializeOpEvents(std::vector<std::shared_ptr<Result>>& events) {
     for (auto& e : events) {
-      if (e->parent_.expired()) {
+      if (e->parent_.expired() && e->deviceType() == c10::DeviceType::CPU) {
         event_tree_.push_back(e);
       }
 
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp
index 9e05f9af1..c9e3e78a8 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/ascend/deviceimpl.cpp
@@ -18,7 +18,10 @@ namespace devapis {
 using ascend_deviceId = int32_t;
 thread_local bool setDevFlag = false;
 
-void initializeVendor() { DIPU_CALLACLRT(aclInit(nullptr)); }
+void initializeVendor() {  // init ACL first, then set the saturation mode
+  DIPU_CALLACLRT(aclInit(nullptr));
+  DIPU_CALLACLRT(aclrtSetDeviceSatMode(ACL_RT_OVERFLOW_MODE_INFNAN));
+}
 
 void finalizeVendor() { DIPU_CALLACLRT(aclFinalize()); }
 
diff --git a/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp b/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp
index 944bfdb32..b68d5e818 100644
--- a/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp
+++ b/dipu/torch_dipu/csrc_dipu/vendor/droplet/DropletGeneratorImpl.cpp
@@ -1,3 +1,5 @@
+#include <cstring>
+
 #include <ATen/Functions.h>
 #include <ATen/Utils.h>
 
@@ -7,21 +9,44 @@
 
 namespace dipu {
 
-// Discriminate floating device type.
-// static bool is_floating_device = true;
-
-// just an example
-// not implemented now
 class DROPLETGeneratorImpl : public dipu::DIPUGeneratorImpl {
+ private:
+  static constexpr std::size_t seed_size = sizeof(uint64_t);   // seed bytes
+  static constexpr std::size_t offset_size = sizeof(int64_t);  // offset bytes
+  static constexpr std::size_t total_size = seed_size + offset_size;
+
  public:
-  DROPLETGeneratorImpl(at::DeviceIndex device_index)
+  explicit DROPLETGeneratorImpl(at::DeviceIndex device_index)
       : dipu::DIPUGeneratorImpl(device_index) {}
 
-  void set_state(const c10::TensorImpl& state) override {}
-
-  void update_state() const override {}
+  void set_state(const c10::TensorImpl& state) override {
+    at::detail::check_rng_state(state);
+    // Accept either a full {seed, offset} blob or a seed-only blob.
+    const auto numel = state.numel();
+    const bool size_ok =
+        numel == total_size || numel == total_size - offset_size;
+    TORCH_CHECK(size_ok, "RNG state size is invalid");
+    state_ = at::Tensor(
+        state.shallow_copy_and_detach(state.version_counter(), true));
+    state_need_reset_ = false;
+  }
+
+  void update_state() const override {
+    if (!state_need_reset_) {
+      return;  // nothing to rebuild
+    }
+    state_ = at::detail::empty_cpu({static_cast<int64_t>(total_size)},
+                                   c10::ScalarType::Byte, c10::nullopt,
+                                   c10::nullopt, c10::nullopt, c10::nullopt);
+    auto* bytes = state_.data_ptr<uint8_t>();  // layout: seed, then offset
+    const uint64_t seed = this->current_seed();
+    std::memcpy(bytes, &seed, seed_size);
+    std::memset(bytes + seed_size, 0, offset_size);
+    state_need_reset_ = false;
+  }
 };
 
+// NOLINTNEXTLINE(readability-const-return-type)
 const at::Generator vendorMakeGenerator(at::DeviceIndex device_index) {
   return at::make_generator<DROPLETGeneratorImpl>(device_index);
 }