Merge remote-tracking branch 'origin' into tiling-bf16-support

libxsmm · Feb 19, 2025 · 304bc8c · 304bc8c
2 parents f717259 + cb1e22f
commit 304bc8c
Show file tree

Hide file tree

Showing 88 changed files with 605 additions and 476 deletions.
diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ If you're having trouble with your build, you can use Conda to create a minimal
 git clone https://github.com/llvm/llvm-project.git
 
 # checking out a tpp-mlir compatible version of llvm-project
-wget https://raw.githubusercontent.com/plaidml/tpp-mlir/main/build_tools/llvm_version.txt
+wget https://raw.githubusercontent.com/libxsmm/tpp-mlir/main/build_tools/llvm_version.txt
 pushd llvm-project
 git checkout `cat ../llvm_version.txt`
 popd
@@ -51,7 +51,7 @@ cmake -G Ninja ../llvm \
    -DLLVM_BUILD_EXAMPLES=ON \
    -DLLVM_INSTALL_UTILS=ON \
    -DLLVM_TARGETS_TO_BUILD="host" \
-   -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+   -DCMAKE_BUILD_TYPE=Release \
    -DLLVM_ENABLE_ASSERTIONS=ON \
    -DCMAKE_C_COMPILER=clang \
    -DCMAKE_CXX_COMPILER=clang++ \
@@ -82,7 +82,7 @@ pushd tpp-mlir/build
 # Build & test
 # Please, make sure to use clang to build TPP-MLIR
 cmake -G Ninja .. \
-   -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+   -DCMAKE_BUILD_TYPE=Release \
    -DMLIR_DIR=$CUSTOM_LLVM_ROOT/lib/cmake/mlir \
    -DLLVM_EXTERNAL_LIT=$CUSTOM_LLVM_ROOT/bin/llvm-lit \
    -DCMAKE_C_COMPILER=clang \
@@ -95,20 +95,22 @@ popd
 
 To enable experimental GPU support see: [GPU/README.md](lib/TPP/GPU/README.md)
 
+In the example above, we are building both LLVM/MLIR and tpp-mlir in release mode. You can easily change the build type by adapting the `-DCMAKE_BUILD_TYPE` option, e.g. `-DCMAKE_BUILD_TYPE=RelWithDebInfo`.
+
 ### Conda Environment
 
 Every modern Linux and MacOS system should be able to build our project without glitches, however, you may have an older OS or some special condition (cluster environment).
 As each operating system has its own package manager and package names, we opted for providing instructions for the user-level package manager ```conda```.
 This environment has been successfully tested on top of a Fedora Server minimal installation with less than 400 system-wide packages being installed.
 
-Initial Setup (using Conda):
+Initial Setup (using Conda via Miniforge):
 ```sh
 export TPPMLIR_WORKSPACE_DIR=/foo
 cd ${TPPMLIR_WORKSPACE_DIR}
 export ARCH_NAME=$(uname -m)
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${ARCH_NAME}.sh
-bash Miniconda3-latest-Linux-${ARCH_NAME}.sh -b -p ${TPPMLIR_WORKSPACE_DIR}/miniconda3
-eval "$(${TPPMLIR_WORKSPACE_DIR}/miniconda3/bin/conda shell.bash hook)"
+wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${ARCH_NAME}.sh
+bash Miniforge3-Linux-${ARCH_NAME}.sh -b -p ${TPPMLIR_WORKSPACE_DIR}/miniforge3
+eval "$(${TPPMLIR_WORKSPACE_DIR}/miniforge3/bin/conda shell.bash hook)"
 conda activate
 
 conda install -y cmake ninja git clang clangxx llvm lld llvm-openmp llvm-tools binutils
@@ -124,7 +126,7 @@ Reloading the environment  after conda deactivate/logout/reboot:
 ```sh
 export TPPMLIR_WORKSPACE_DIR=/foo
 cd ${TPPMLIR_WORKSPACE_DIR}
-eval "$(${TPPMLIR_WORKSPACE_DIR}/miniconda3/bin/conda shell.bash hook)"
+eval "$(${TPPMLIR_WORKSPACE_DIR}/miniforge3/bin/conda shell.bash hook)"
 conda activate
 ```
 

diff --git a/benchmarks/mlir/fp32-pack-gemm-operand-a-512x1024.mlir b/benchmarks/mlir/fp32-pack-gemm-operand-a-512x1024.mlir
@@ -4,7 +4,7 @@
 // BENCH_TOTAL_FLOPS: 2097152
 
 func.func @entry(%arg0: tensor<512x1024xf32>, %arg1: tensor<16x32x32x32xf32>) -> tensor<16x32x32x32xf32> {
-  %pack = tensor.pack %arg0
+  %pack = linalg.pack %arg0
     inner_dims_pos = [0, 1]
     inner_tiles = [32, 32]
     into %arg1 : tensor<512x1024xf32> -> tensor<16x32x32x32xf32>

diff --git a/benchmarks/mlir/fp32-pack-gemm-operand-b-512x1024.mlir b/benchmarks/mlir/fp32-pack-gemm-operand-b-512x1024.mlir
@@ -4,7 +4,7 @@
 // BENCH_TOTAL_FLOPS: 2097152
 
 func.func @entry(%arg0: tensor<1024x512xf32>, %arg1: tensor<16x32x32x32xf32>) -> tensor<16x32x32x32xf32> {
-  %0 = tensor.pack %arg0
+  %0 = linalg.pack %arg0
     outer_dims_perm = [1, 0]
     inner_dims_pos = [0, 1]
     inner_tiles = [32, 32]

diff --git a/benchmarks/mlir/fp32-unpack-gemm-operand-a-512x512.mlir b/benchmarks/mlir/fp32-unpack-gemm-operand-a-512x512.mlir
@@ -4,7 +4,7 @@
 // BENCH_TOTAL_FLOPS: 1048576
 
 func.func @entry(%arg0: tensor<16x16x32x32xf32>, %arg1: tensor<512x512xf32>) -> tensor<512x512xf32> {
-  %unpack = tensor.unpack %arg0
+  %unpack = linalg.unpack %arg0
     inner_dims_pos = [0, 1]
     inner_tiles = [32, 32]
     into %arg1 : tensor<16x16x32x32xf32> -> tensor<512x512xf32>

diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt
@@ -1 +1 @@
-3654f1baa66f524c89e40ab24e18e594e56363e9
+2b71df5a74cb5bd67f3f34277749dc920fd35105
diff --git a/docs/TPPDialect.md b/docs/TPPDialect.md
@@ -172,12 +172,12 @@ Should be fused with the user(s).
 GEMM ops have transposed versions, we should use this op to annotate operands.
 
 ## Tensor pack
-The tensor operation `tensor.pack` does a "block transpose" (n,m <-> m,n) copies.
+The tensor operation `linalg.pack` does a "block transpose" (n,m <-> m,n) copies.
 We lower this to a series of `tpp.copy` into temporary tiles if needed.
 But the idea is that all constant tensors would have been packed by the compiler already and all input packs would be combined at the beginning.
 
 ## Tensor Unpack
-The tensor operation `tensor.unpack` does a "block transpose" (n,m <-> m,n) copies.
+The tensor operation `linalg.unpack` does a "block transpose" (n,m <-> m,n) copies.
 
 ## VNNI Pack
 Packs into VNNI shape.

diff --git a/include/TPP/IR/StructuredOpMatcher.h b/include/TPP/IR/StructuredOpMatcher.h
@@ -190,7 +190,7 @@ struct HasStaticStrides {
     SmallVector<int64_t> strides;
     if (auto memRefType = dyn_cast_or_null<MemRefType>(operandType)) {
       int64_t offset;
-      if (failed(getStridesAndOffset(memRefType, strides, offset)))
+      if (failed(memRefType.getStridesAndOffset(strides, offset)))
         return false;
       if (llvm::any_of(strides, [](int64_t stride) {
             return stride == ShapedType::kDynamic;

diff --git a/include/TPP/Passes.td b/include/TPP/Passes.td
@@ -262,26 +262,26 @@ def CombineXsmmOpPass : Pass<"combine-xsmm-op-optimization", "func::FuncOp"> {
 }
 
 def PropagatePackUnPack : Pass<"propagate-pack-and-unpack", "func::FuncOp"> {
-  let summary = "Propagate tensor.pack and tensor.unpack";
+  let summary = "Propagate linalg.pack and linalg.unpack";
   let description = [{
-    Attempt to push tensor.pack and tensor.unpack at the boundaries. Currently,
+    Attempt to push linalg.pack and linalg.unpack at the boundaries. Currently,
     it propagates through linalg element-wise operations. Only one operand in the
-    generic must come from a tensor.pack/tensor.unpack.
+    generic must come from a linalg.pack/linalg.unpack.
   }];
 }
 
 def SimplifyAndCanonicalizePack : Pass<"simplify-pack", "func::FuncOp"> {
-  let summary = "Simplify and canonicalize tensor.pack";
+  let summary = "Simplify and canonicalize linalg.pack";
   let description = [{
-    Apply `tensor.pack` and `tensor.unpack` canonicalization and simplification
+    Apply `linalg.pack` and `linalg.unpack` canonicalization and simplification
     patterns.
   }];
 }
 
 def ConstantFoldPack : Pass<"constant-fold-pack", "ModuleOp"> {
-  let summary = "Constant fold tensor.pack";
+  let summary = "Constant fold linalg.pack";
   let description = [{
-    Reduce pack overhead by folding tensor.pack into constant tensors.
+    Reduce pack overhead by folding linalg.pack into constant tensors.
   }];
   let dependentDialects = ["linalg::LinalgDialect",
                            "tensor::TensorDialect",

diff --git a/include/TPP/Transforms/Utils/VNNIUtils.h b/include/TPP/Transforms/Utils/VNNIUtils.h
@@ -28,14 +28,16 @@ class LinalgOp;
 
 namespace vnni {
 namespace utils {
-
 enum class VnniOperandRank {
   TRANSPOSE = 3,
   GEMM = 3,
   BRGEMM_INS = 4,
   BRGEMM_OUTS = 3
 };
 
+// Returns True if the current architecture supports AMX instructions.
+bool hasAMX();
+
 // Return the VNNI blocking factor if it can be determined for the given type or
 // zero, otherwise.
 // Optionally, an operation can be provided to give access to DLTI.

diff --git a/lib/TPP/Conversion/ConvertCheckToLoops/ConvertCheckToLoops.cpp b/lib/TPP/Conversion/ConvertCheckToLoops/ConvertCheckToLoops.cpp
@@ -184,7 +184,7 @@ struct ConvertCheckToLoops
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populateCheckToLoopsPatterns(patterns);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/Conversion/ConvertLinalgToFunc/ConvertLinalgToFunc.cpp b/lib/TPP/Conversion/ConvertLinalgToFunc/ConvertLinalgToFunc.cpp
@@ -127,7 +127,7 @@ struct ConvertLinalgToFunc
     auto *ctx = &getContext();
     RewritePatternSet patterns(ctx);
     patterns.add<ConvertMatmulOp>(ctx);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/Conversion/ConvertLinalgToXsmm/ConvertLinalgToXsmm.cpp b/lib/TPP/Conversion/ConvertLinalgToXsmm/ConvertLinalgToXsmm.cpp
@@ -585,7 +585,7 @@ static FailureOr<BrgemmInfo> checkAccess(linalg::LinalgOp linalgOp, unsigned m,
     strideB = (*stridesOnB)[*batchPosCodomainB];
   }
 
-  auto loops = linalgOp.computeStaticLoopSizes();
+  auto loops = linalgOp.getStaticLoopRanges();
   int64_t batchVal = (batchPos) ? loops[batchPos.value()] : 0;
 
   bool isVnni = vnni::utils::isInVnniLayout(linalgOp);
@@ -847,7 +847,7 @@ void ConvertLinalgToXsmm::runOnOperation() {
   SmallVector<StringRef> skipPatterns(skipOperations.begin(),
                                       skipOperations.end());
   tpp::populateLinalgToXsmmPatterns(patterns, skipPatterns);
-  if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
+  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
     return signalPassFailure();
 }
 

diff --git a/lib/TPP/Conversion/ConvertPerfToFunc/ConvertPerfToFunc.cpp b/lib/TPP/Conversion/ConvertPerfToFunc/ConvertPerfToFunc.cpp
@@ -242,7 +242,7 @@ struct ConvertPerfToFunc
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populatePerfToFuncPatterns(patterns);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/Conversion/ConvertPerfToLoops/ConvertPerfToLoops.cpp b/lib/TPP/Conversion/ConvertPerfToLoops/ConvertPerfToLoops.cpp
@@ -105,7 +105,7 @@ struct ConvertPerfToLoops
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populatePerfToLoopsPatterns(patterns);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/Conversion/ConvertVectorToXsmm/ConvertVectorToXsmm.cpp b/lib/TPP/Conversion/ConvertVectorToXsmm/ConvertVectorToXsmm.cpp
@@ -248,7 +248,7 @@ struct ConvertVectorToXsmm
 
   void runOnOperation() final {
     PatternRewriter rewriter(&getContext());
-    if (failed(applyPatternsAndFoldGreedily(getOperation(), patterns))) {
+    if (failed(applyPatternsGreedily(getOperation(), patterns))) {
       signalPassFailure();
     }
   }

diff --git a/lib/TPP/Conversion/ConvertXsmmToFunc/ConvertXsmmToFunc.cpp b/lib/TPP/Conversion/ConvertXsmmToFunc/ConvertXsmmToFunc.cpp
@@ -432,7 +432,7 @@ struct ConvertXsmmToFunc
                  ConvertGemmDispatchOp, ConvertBrgemmDispatchOp,
                  ConvertFusedBrgemmOp, ConvertIntelAMXTileConfigDispatchOp>(
         patterns.getContext());
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/DefaultPipeline.cpp b/lib/TPP/DefaultPipeline.cpp
@@ -22,6 +22,7 @@
 #include "TPP/Dialect/Perf/PerfOps.h"
 #include "TPP/Dialect/Xsmm/XsmmDialect.h"
 #include "TPP/PassUtils.h"
+#include "TPP/Transforms/Utils/VNNIUtils.h"
 #include "mlir/Transforms/Passes.h"
 
 #include <string>
@@ -187,27 +188,32 @@ struct DefaultPipeline : public tpp::impl::DefaultPipelineBase<DefaultPipeline>,
       pm.addPass(createPrintIRPass());
 
     // Lower to LLVM
-    pm.addPass(createConvertVectorToLLVMPass());
+    ConvertVectorToLLVMPassOptions options;
+    options.amx = vnni::utils::hasAMX();
+    pm.addPass(createConvertVectorToLLVMPass(options));
     pm.addPass(createFinalizeMemRefToLLVMConversionPass());
-    pm.addPass(createConvertSCFToCFPass());
+    pm.addPass(createSCFToControlFlowPass());
     if (defParallel)
       pm.addPass(createConvertOpenMPToLLVMPass());
-    pm.addPass(createConvertMathToLLVMPass());
 
     pm.addNestedPass<func::FuncOp>(createGpuAsyncRegionPass());
     pm.addPass(createGpuToLLVMConversionPass());
     GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
     gpuModuleToBinaryPassOptions.compilationTarget = "fatbin";
     pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
+    pm.addPass(createConvertMathToLLVMPass());
     pm.addPass(createAsyncToAsyncRuntimePass());
     pm.addPass(createAsyncRuntimeRefCountingPass());
     pm.addPass(createConvertAsyncToLLVMPass());
+    pm.addPass(createConvertIndexToLLVMPass());
 
     pm.addPass(createConvertFuncToLLVMPass());
 
-    pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-    pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-    pm.addNestedPass<func::FuncOp>(createCSEPass());
+    pm.addPass(createArithToLLVMConversionPass());
+    pm.addPass(createConvertControlFlowToLLVMPass());
+    pm.addPass(createUBToLLVMConversionPass());
+    pm.addPass(createCanonicalizerPass());
+    pm.addPass(createCSEPass());
     pm.addPass(createReconcileUnrealizedCastsPass());
 
     // Anything useful has been lowered by now.

diff --git a/lib/TPP/DefaultTppPasses.cpp b/lib/TPP/DefaultTppPasses.cpp
@@ -104,7 +104,7 @@ struct DefaultTppPasses
     if (linalgToLoops) {
       // Lower linalg directly to loops.
       // Skip all TPP transformations.
-      // Generalize tensor.pack and tensor.unpack.
+      // Generalize linalg.pack and linalg.unpack.
       pm.addPass(createLowerPacksAndUnPacks());
       pm.addNestedPass<func::FuncOp>(createDecomposeAggregatedOps());
       pm.addPass(createBufferize());
@@ -120,7 +120,7 @@ struct DefaultTppPasses
       TppMappingOptions tppMappingOptions{lowerPackUnpackWithoutTranspose};
       pm.addPass(createTppMapping(tppMappingOptions));
 
-      // Generalize tensor.pack and tensor.unpack.
+      // Generalize linalg.pack and linalg.unpack.
       pm.addPass(createLowerPacksAndUnPacks());
       pm.addPass(createCleanup());
 

diff --git a/lib/TPP/Dialect/Xsmm/XsmmUtils.cpp b/lib/TPP/Dialect/Xsmm/XsmmUtils.cpp
@@ -151,7 +151,7 @@ getVectorUnaryInfo(MemRefType shapedType, MemRefType inputType,
     SmallVector<int64_t> strides;
     int64_t offset;
 
-    if (failed(getStridesAndOffset(memrefType, strides, offset))) {
+    if (failed(memrefType.getStridesAndOffset(strides, offset))) {
       return failure();
     }
     if (strides.empty()) {

diff --git a/lib/TPP/GPU/GpuConversion.cpp b/lib/TPP/GPU/GpuConversion.cpp
@@ -58,7 +58,7 @@ struct GpuConversion : public tpp::impl::GpuConversionBase<GpuConversion>,
   void constructPipeline() override {
     // Map loops into GPU kernels.
     pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());
-    pm.addNestedPass<func::FuncOp>(createParallelLoopToGpuPass());
+    pm.addNestedPass<func::FuncOp>(createConvertParallelLoopToGpuPass());
     pm.addPass(createCleanup());
 
     // First lower linalg using custom patterns then fall back to

diff --git a/lib/TPP/GPU/GpuDataTransfer.cpp b/lib/TPP/GPU/GpuDataTransfer.cpp
@@ -238,7 +238,7 @@ class GpuDataTransfer : public tpp::impl::GpuDataTransferBase<GpuDataTransfer> {
     RewritePatternSet patterns(ctx);
     // TODO: Add cleanup patterns to minimize data copies.
     patterns.add<TransferDataToGpu>(ctx);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/GPU/GpuInlineConstants.cpp b/lib/TPP/GPU/GpuInlineConstants.cpp
@@ -81,7 +81,7 @@ struct GpuInlineConstants
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populateGpuInlineConstantsPatterns(patterns);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/GPU/GpuToCuda.cpp b/lib/TPP/GPU/GpuToCuda.cpp
@@ -68,7 +68,7 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
     pm.addNestedPass<gpu::GPUModuleOp>(arith::createArithExpandOpsPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createLowerAffinePass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSCFPass());
-    pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
+    pm.addNestedPass<gpu::GPUModuleOp>(createSCFToControlFlowPass());
 
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVGPUToNVVMPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
@@ -77,6 +77,7 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertFuncToLLVMPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createArithToLLVMConversionPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertIndexToLLVMPass());
+    pm.addNestedPass<gpu::GPUModuleOp>(createUBToLLVMConversionPass());
 
     GpuNVVMAttachTargetOptions nvvmTargetOptions;
     nvvmTargetOptions.triple = gpuTriple;
@@ -85,7 +86,6 @@ struct GpuToCuda : public tpp::impl::GpuToCudaBase<GpuToCuda>,
     pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
 
     // Create CUDA kernels.
-    pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());

diff --git a/lib/TPP/GPU/GpuVectorize.cpp b/lib/TPP/GPU/GpuVectorize.cpp
@@ -109,7 +109,7 @@ struct GpuVectorize : public tpp::impl::GpuVectorizeBase<GpuVectorize> {
     vector::TransferReadOp::getCanonicalizationPatterns(patterns, ctx);
     vector::TransferWriteOp::getCanonicalizationPatterns(patterns, ctx);
 
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 

diff --git a/lib/TPP/GPU/LinalgToXeGPU.cpp b/lib/TPP/GPU/LinalgToXeGPU.cpp
@@ -884,7 +884,7 @@ static LogicalResult createDPASKernel(linalg::LinalgOp linalgOp,
 
   // DPAS only works with F32 accumulators.
   auto dpasResType =
-      VectorType::get(dpasTypeC.getShape(), FloatType::getF32(ctx));
+      VectorType::get(dpasTypeC.getShape(), Float32Type::get(ctx));
 
   // Extend the accumulation values if needed.
   auto convOutPrecision = !typeC.getElementType().isF32();
@@ -1397,12 +1397,12 @@ struct LinalgToXeGPU : public tpp::impl::LinalgToXeGPUBase<LinalgToXeGPU> {
     // Run GEMM pattern first to allow fusion with its consumers.
     RewritePatternSet gemmPatterns(&getContext());
     populateLinalgGemmToXeGPUPatterns(gemmPatterns, options);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(gemmPatterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(gemmPatterns));
 
     // Convert other remaining ops.
     RewritePatternSet patterns(&getContext());
     populateLinalgEltwiseToXeGPUPatterns(patterns, options);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		3654f1baa66f524c89e40ab24e18e594e56363e9
		2b71df5a74cb5bd67f3f34277749dc920fd35105