diff --git a/clang/lib/DPCT/AnalysisInfo.cpp b/clang/lib/DPCT/AnalysisInfo.cpp
index d410a7e8ba57..458724d38b1a 100644
--- a/clang/lib/DPCT/AnalysisInfo.cpp
+++ b/clang/lib/DPCT/AnalysisInfo.cpp
@@ -246,7 +246,8 @@ std::shared_ptr<TextureObjectInfo> makeTextureObjectInfo(const ValueDecl *D,
     if (auto VD = dyn_cast<VarDecl>(D)) {
       return std::make_shared<TextureObjectInfo>(VD);
     }
-  } else if (auto PVD = dyn_cast<ParmVarDecl>(D)) {
+  } else if (const auto *PVD = dyn_cast<ParmVarDecl>(D);
+             PVD && PVD->getTypeSourceInfo()) {
     return std::make_shared<TextureObjectInfo>(PVD);
   }
   return std::shared_ptr<TextureObjectInfo>();
diff --git a/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp b/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp
index 79673ca76c2e..672a76e3f5b2 100644
--- a/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp
+++ b/clang/lib/DPCT/RulesInclude/InclusionHeaders.cpp
@@ -200,7 +200,8 @@ void IncludesCallbacks::InclusionDirective(
         DpctGlobalInfo::getIncludeMapSet().push_back({IncludedFile, Repl});
       }
     }
-    return;
+    if (Global.isInRoot(IncludedFile))
+      return;
   }
 
   if (!Global.isInAnalysisScope(LocInfo.first) &&
diff --git a/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc b/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc
index 87991dc3ae14..dad32862ed64 100644
--- a/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc
+++ b/clang/lib/DPCT/RulesInclude/InclusionHeaders.inc
@@ -50,8 +50,7 @@ REGIST_INCLUSION("curand_kernel.h", FullMatch, Rng, Replace, false,
 REGIST_INCLUSION("cusparse.h", FullMatch, Sparse, Replace, false,
                  HeaderType::HT_DPCT_SPBLAS_Utils)
 REGIST_INCLUSION("cusparse_v2.h", FullMatch, Sparse, Replace, false,
-                 HeaderType::HT_DPCT_SPBLAS_Utils,
-                 HeaderType::HT_DPCT_BLAS_Utils)
+                 HeaderType::HT_DPCT_SPBLAS_Utils)
 
 REGIST_INCLUSION("cufft.h", FullMatch, FFT, Replace, false,
                  HeaderType::HT_DPCT_FFT_Utils)
diff --git a/clang/lib/DPCT/RulesLang/RulesLang.cpp b/clang/lib/DPCT/RulesLang/RulesLang.cpp
index af65bd17b184..68ffdaf523b7 100644
--- a/clang/lib/DPCT/RulesLang/RulesLang.cpp
+++ b/clang/lib/DPCT/RulesLang/RulesLang.cpp
@@ -1082,6 +1082,13 @@ void TypeInDeclRule::runRule(const MatchFinder::MatchResult &Result) {
       }
       Str = Itr->second;
     }
+  } else if (llvm::StringRef(TypeStr).starts_with("cublas")) {
+    // In most cases we do not need to insert blas_utils.hpp manually, since
+    // the include of cublas_v2.h will itself be migrated. However, when the
+    // include directive of cublas_v2.h is outside the in-root, the migrated
+    // code cannot be built successfully without this insertion.
+    DpctGlobalInfo::getInstance().insertHeader(
+        TL->getBeginLoc(), HeaderType::HT_DPCT_BLAS_Utils);
   }
 
   // Add '#include ' directive to the file only once
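
Note: the AnalysisInfo.cpp hunk combines a C++17 init-statement with an extra
null check, so a ParmVarDecl that carries no type source info (for example, one
synthesized by the compiler) no longer produces a TextureObjectInfo. A minimal
standalone sketch of the pattern; Decl/ParmDecl below are simplified stand-ins
for the clang classes, not the real API:

    #include <iostream>
    #include <memory>

    struct Decl { virtual ~Decl() = default; };
    struct ParmDecl : Decl {
      const void *TypeInfo = nullptr; // may be null for synthesized parameters
      const void *getTypeSourceInfo() const { return TypeInfo; }
    };

    // C++17 "if (init; cond)": bind the downcast and test the precondition in
    // one condition, mirroring the patched makeTextureObjectInfo.
    std::shared_ptr<ParmDecl> makeInfo(Decl *D) {
      if (const auto *PVD = dynamic_cast<const ParmDecl *>(D);
          PVD && PVD->getTypeSourceInfo())
        return std::make_shared<ParmDecl>(*PVD);
      return nullptr;
    }

    int main() {
      ParmDecl P; // no type source info attached
      std::cout << (makeInfo(&P) ? "created" : "skipped") << '\n'; // "skipped"
    }
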
diff --git a/clang/test/dpct/pytorch/ATen.cu b/clang/test/dpct/pytorch/ATen.cu
index d613ff66d474..8df6fd9be66b 100644
--- a/clang/test/dpct/pytorch/ATen.cu
+++ b/clang/test/dpct/pytorch/ATen.cu
@@ -5,7 +5,7 @@
 // RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/ATen/
 // RUN: cd %T/pytorch/ATen
 // RUN: mkdir dpct_out
-// RUN: dpct -out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/ATen/user_defined_rule_pytorch.yaml -- -x cuda --cuda-host-only
+// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/ATen/user_defined_rule_pytorch.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_cuda_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src
 // RUN: FileCheck --input-file %T/pytorch/ATen/dpct_out/ATen.dp.cpp --match-full-lines %T/pytorch/ATen/src/ATen.cu
 // RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/ATen/dpct_out/ATen.dp.cpp -o %T/pytorch/ATen/dpct_out/ATen.dp.o %}
diff --git a/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp b/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp
index 68bf1013d60f..fb0828bfee5c 100644
--- a/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp
+++ b/clang/test/dpct/pytorch_api_type/pytoch_api_test1.cpp
@@ -8,15 +8,17 @@
 
 #include 
 #include 
 
-// CHECK: #include "c10/xpu/XPUStream.h"
+// CHECK: #include "ATen/xpu/XPUContext.h"
 #include "ATen/cuda/CUDAContext.h"
 
-class TensorStub {
+namespace torch {
+class Tensor {
 public:
   bool is_cuda() const { return true; }
 };
+} // namespace torch
 
 #define MY_CHECK(condition, message) \
   do { \
     if (!(condition)) { \
@@ -25,9 +27,8 @@ class TensorStub {
     } \
   } while (0)
 
-int main() {
-  TensorStub x;
-  // CHECK: MY_CHECK(x.is_xpu(), "x must reside on device");
+void foo(torch::Tensor x) {
+  // CHECK: MY_CHECK(x.is_xpu(), "x must reside on device");
   MY_CHECK(x.is_cuda(), "x must reside on device");
   return 0;
 }
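
Note: the test now declares torch::Tensor itself because the rewritten
rule_torch_Tensor (see pytorch_api.yaml at the end of this patch) matches on
the qualified type rather than on the bare token is_cuda, so an ad-hoc
TensorStub would no longer be migrated. A hedged sketch of the method rewrite
the Class-kind rule performs; the stub mirrors the test, and the expansion of
$method_base is inferred from the test's CHECK lines:

    namespace torch {
    class Tensor {
    public:
      bool is_cuda() const { return false; } // matched by "In: is_cuda"
      bool is_xpu() const { return true; }   // named in "Out: $method_base is_xpu()"
    };
    } // namespace torch

    // The rule rewrites the call while keeping the member-access base:
    // "x.is_cuda()" becomes "x.is_xpu()", with $method_base expanding to "x.".
    bool device_ok(const torch::Tensor &x) {
      return x.is_xpu(); // what MY_CHECK(x.is_cuda(), ...) checks after migration
    }

    int main() { return device_ok(torch::Tensor{}) ? 0 : 1; }
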
is_xpu(), "x must reside on device"); MY_CHECK(x.is_cuda(), "x must reside on device"); return 0; diff --git a/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu b/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu index 284f67902330..771dc3c2833f 100644 --- a/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu +++ b/clang/test/dpct/pytorch_api_type/pytoch_api_test2.cu @@ -6,12 +6,26 @@ #define AT_CUDA_CHECK(stmt) (stmt) -namespace at { +namespace c10 { +using DeviceIndex = int8_t; namespace cuda { -cudaStream_t getCurrentCUDAStream() { - return nullptr; // Return a dummy stream +class CUDAStream { +public: + CUDAStream() {} + cudaStream_t stream() { return 0; } + operator cudaStream_t() const { + return stream(); + } + cudaStream_t stream() const; +}; +CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1) { + return CUDAStream(); } } // namespace cuda +} // namespace c10 + +namespace at { +using namespace c10; } // namespace at __global__ void kernel() {} @@ -21,15 +35,19 @@ int main() { dim3 blockSize(8, 8, 1); void *args[] = {nullptr}; - // CHECK: [&](){ - // CHECK-NEXT: &static_cast(c10::xpu::getCurrentXPUStream())->parallel_for( + // CHECK:([&](){ + // CHECK-NEXT: ((sycl::queue*)(c10::xpu::getCurrentXPUStream()))->parallel_for( // CHECK-NEXT: sycl::nd_range<3>(gridSize * blockSize, blockSize), // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) { // CHECK-NEXT: kernel(); // CHECK-NEXT: }); // CHECK-NEXT: return 0; - // CHECK-NEXT:}(); + // CHECK-NEXT:}()); AT_CUDA_CHECK(cudaLaunchKernel((const void *)kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream())); - + at::DeviceIndex d = 1; + // CHECK: c10::xpu::getCurrentXPUStream(d); + at::cuda::getCurrentCUDAStream(d); + // CHECK: dpct::queue_ptr s = &static_cast(c10::xpu::getCurrentXPUStream(). 
diff --git a/clang/test/dpct/two_analysis_scopes/app/test.cu b/clang/test/dpct/two_analysis_scopes/app/test.cu
index 247013fddf31..0819f27962e5 100644
--- a/clang/test/dpct/two_analysis_scopes/app/test.cu
+++ b/clang/test/dpct/two_analysis_scopes/app/test.cu
@@ -1,15 +1,16 @@
 // RUN: dpct --format-range=none --out-root %T/out %s --analysis-scope-path %S --analysis-scope-path %S/../deps --cuda-include-path="%cuda-path/include" --extra-arg="-I%S/../deps"
 // RUN: FileCheck --match-full-lines --input-file %T/out/test.dp.cpp %s
+// RUN: FileCheck --match-full-lines --input-file %T/out/test.dp.hpp %S/test.cuh
 // RUN: echo "// empty" > %T/out/dep.h
 // RUN: %if build_lit %{icpx -c -fsycl %T/out/test.dp.cpp -o %T/out/test.dp.o -I%T/out %}
 
 // CHECK: #include <sycl/sycl.hpp>
 // CHECK-NEXT: #include <dpct/dpct.hpp>
-// CHECK-NEXT: #include 
+// CHECK-NEXT: #include "test.dp.hpp"
 // CHECK-NEXT: #include 
+#include "test.cuh"
 #include 
 #include 
-#include 
 
 void foo(cublasHandle_t handle, const half *a, const half *b, half *c, int n,
          half *alpha, half *beta) {
diff --git a/clang/test/dpct/two_analysis_scopes/app/test.cuh b/clang/test/dpct/two_analysis_scopes/app/test.cuh
new file mode 100644
index 000000000000..f90a1831f970
--- /dev/null
+++ b/clang/test/dpct/two_analysis_scopes/app/test.cuh
@@ -0,0 +1,5 @@
+// CHECK: #include <sycl/sycl.hpp>
+// CHECK-NEXT: #include <dpct/dpct.hpp>
+#include <cublas_v2.h>
+
+void foo_bar(cublasHandle_t h);
diff --git a/clang/tools/dpct/DpctOptRules/pytorch_api.yaml b/clang/tools/dpct/DpctOptRules/pytorch_api.yaml
index 169e96229d8b..a363f955fa5e 100644
--- a/clang/tools/dpct/DpctOptRules/pytorch_api.yaml
+++ b/clang/tools/dpct/DpctOptRules/pytorch_api.yaml
@@ -40,29 +40,42 @@
   Priority: Takeover
   In: get_in_order_queue
   Out: static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream())
+  Includes: ["c10/xpu/XPUStream.h"]
 
-- Rule: rule_process_is_cuda
-  Kind: PatternRewriter
+- Rule: rule_torch_Tensor
+  Kind: Class
   Priority: Takeover
-  In: is_cuda
-  Out: is_xpu
+  In: torch::Tensor
+  Out: torch::Tensor
+  Methods:
+    - In: is_cuda
+      Out: $method_base is_xpu()
 
 - Rule: rule_getCurrentCUDAStream
-  Kind: PatternRewriter
+  Kind: API
   Priority: Takeover
-  In: at::cuda::getCurrentCUDAStream()
-  Out: |
-    &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream())
+  In: at::cuda::getCurrentCUDAStream
+  Out: c10::xpu::getCurrentXPUStream($1)
+  Includes: ["c10/xpu/XPUStream.h"]
+
+- Rule: rule_CUDAStream
+  Kind: Class
+  Priority: Takeover
+  In: c10::cuda::CUDAStream
+  Out: c10::xpu::XPUStream
+  Methods:
+    - In: stream
+      Out: \&static_cast<sycl::queue &>($method_base queue())
 
 - Rule: rule_remove_AT_CUDA_CHECK
-  Kind: PatternRewriter
+  Kind: Macro
   Priority: Takeover
-  In: AT_CUDA_CHECK(${args});
-  Out: ${args};
+  In: AT_CUDA_CHECK
+  Out: |
 
 - Rule: rule_CUDAContext_h
   Kind: Header
   Priority: Takeover
   In: ATen/cuda/CUDAContext.h
-  Out: c10/xpu/XPUStream.h
+  Out: ATen/xpu/XPUContext.h
   Includes: []
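
Note: rule_remove_AT_CUDA_CHECK now has Kind: Macro with an empty literal
block for Out, which erases the macro name and keeps only its parenthesized
argument; together with dpct's usual lambda-wrapping of cudaLaunchKernel this
explains the "([&](){ ... }())" shape checked in pytoch_api_test2.cu. A
standalone sketch of why the leftover text still compiles; launch_stub is a
hypothetical stand-in for the migrated parallel_for:

    #include <cstdio>

    int launch_stub() { return 0; } // stands in for the generated parallel_for call

    int main() {
      // BEFORE: AT_CUDA_CHECK(cudaLaunchKernel(...));
      // AFTER : the macro name is gone; its argument survives as an
      //         immediately-invoked lambda wrapped in the old parentheses.
      ([&]() { return launch_stub(); }());
      std::puts("launched");
      return 0;
    }
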