From d338e79a4cc1d52a6b5a1a241c2318a7288c0240 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Wed, 7 Jul 2021 14:28:41 -0700
Subject: [PATCH 01/32] [OpaquePtr] Remove checking pointee type for byval/preallocated type

These currently always require a type parameter. The bitcode reader already
upgrades old bitcode without the type parameter to use the pointee type.

In cases where the caller does not have byval but the callee does, we need to
follow CallBase::paramHasAttr() and also look at the callee for the byval type
so that CallBase::isByValArgument() and CallBase::getParamByValType() are in
sync. Do the same for preallocated.

While we're here add a corresponding version for inalloca since we'll need it
soon.

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D104663
---
 llvm/include/llvm/IR/InstrTypes.h    | 23 +++++++++++---
 llvm/unittests/IR/AttributesTest.cpp | 46 +++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index c690306cd3d27..0e372d1cc8793 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1728,14 +1728,29 @@ class CallBase : public Instruction {
 
   /// Extract the byval type for a call or parameter.
   Type *getParamByValType(unsigned ArgNo) const {
-    Type *Ty = Attrs.getParamByValType(ArgNo);
-    return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType();
+    if (auto *Ty = Attrs.getParamByValType(ArgNo))
+      return Ty;
+    if (const Function *F = getCalledFunction())
+      return F->getAttributes().getParamByValType(ArgNo);
+    return nullptr;
   }
 
   /// Extract the preallocated type for a call or parameter.
   Type *getParamPreallocatedType(unsigned ArgNo) const {
-    Type *Ty = Attrs.getParamPreallocatedType(ArgNo);
-    return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType();
+    if (auto *Ty = Attrs.getParamPreallocatedType(ArgNo))
+      return Ty;
+    if (const Function *F = getCalledFunction())
+      return F->getAttributes().getParamPreallocatedType(ArgNo);
+    return nullptr;
+  }
+
+  /// Extract the inalloca type for a call or parameter.
+ Type *getParamInAllocaType(unsigned ArgNo) const { + if (auto *Ty = Attrs.getParamInAllocaType(ArgNo)) + return Ty; + if (const Function *F = getCalledFunction()) + return F->getAttributes().getParamInAllocaType(ArgNo); + return nullptr; } /// Extract the number of dereferenceable bytes for a call or diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index f260f0f9bf864..d29df4cd3425b 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -7,8 +7,12 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/Attributes.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/AsmParser/Parser.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" using namespace llvm; @@ -252,4 +256,44 @@ TEST(Attributes, AttributeListPrinting) { } } +TEST(Attributes, MismatchedABIAttrs) { + const char *IRString = R"IR( + declare void @f1(i32* byval(i32)) + define void @g() { + call void @f1(i32* null) + ret void + } + declare void @f2(i32* preallocated(i32)) + define void @h() { + call void @f2(i32* null) + ret void + } + declare void @f3(i32* inalloca(i32)) + define void @i() { + call void @f3(i32* null) + ret void + } + )IR"; + + SMDiagnostic Err; + LLVMContext Context; + std::unique_ptr M = parseAssemblyString(IRString, Err, Context); + ASSERT_TRUE(M); + + { + auto *I = cast(&M->getFunction("g")->getEntryBlock().front()); + ASSERT_TRUE(I->isByValArgument(0)); + ASSERT_TRUE(I->getParamByValType(0)); + } + { + auto *I = cast(&M->getFunction("h")->getEntryBlock().front()); + ASSERT_TRUE(I->getParamPreallocatedType(0)); + } + { + auto *I = cast(&M->getFunction("i")->getEntryBlock().front()); + ASSERT_TRUE(I->isInAllocaArgument(0)); + ASSERT_TRUE(I->getParamInAllocaType(0)); + } +} + } // end anonymous namespace From 89f2d98b9870263106adfeb20c835d4751963cf7 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Wed, 7 Jul 2021 21:35:56 +0000 Subject: [PATCH 02/32] [PowerPC] Add P7 RUN line for load and splat test --- llvm/test/CodeGen/PowerPC/load-and-splat.ll | 99 +++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll index c011e45412045..3ec8468dcd364 100644 --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -5,6 +5,10 @@ ; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ ; RUN: -check-prefix=P8 +; RUN: llc -mcpu=pwr7 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \ +; RUN: -check-prefix=P7 + define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readonly %a) local_unnamed_addr { ; P9-LABEL: test: ; P9: # %bb.0: # %entry @@ -19,6 +23,13 @@ define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readon ; P8-NEXT: lxvdsx vs0, 0, r4 ; P8-NEXT: stxvd2x vs0, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: test: +; P7: # %bb.0: # %entry +; P7-NEXT: addi r4, r4, 24 +; P7-NEXT: lxvdsx vs0, 0, r4 +; P7-NEXT: stxvd2x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds double, double* %a, i64 3 %0 = load double, double* %arrayidx, align 8 @@ -43,6 +54,16 @@ define dso_local void @test2(<4 x float>* nocapture %c, float* nocapture 
readonl ; P8-NEXT: xxspltw v2, vs0, 1 ; P8-NEXT: stvx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: test2: +; P7: # %bb.0: # %entry +; P7-NEXT: lwz r4, 12(r4) +; P7-NEXT: addi r5, r1, -16 +; P7-NEXT: stw r4, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r5 +; P7-NEXT: xxspltw vs0, vs0, 0 +; P7-NEXT: stxvw4x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds float, float* %a, i64 3 %0 = load float, float* %arrayidx, align 4 @@ -67,6 +88,16 @@ define dso_local void @test3(<4 x i32>* nocapture %c, i32* nocapture readonly %a ; P8-NEXT: xxspltw v2, vs0, 1 ; P8-NEXT: stvx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: test3: +; P7: # %bb.0: # %entry +; P7-NEXT: lwz r4, 12(r4) +; P7-NEXT: addi r5, r1, -16 +; P7-NEXT: stw r4, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r5 +; P7-NEXT: xxspltw vs0, vs0, 0 +; P7-NEXT: stxvw4x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, i32* %a, i64 3 %0 = load i32, i32* %arrayidx, align 4 @@ -90,6 +121,16 @@ define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a ; P8-NEXT: lxvdsx vs0, 0, r4 ; P8-NEXT: stxvd2x vs0, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: test4: +; P7: # %bb.0: # %entry +; P7-NEXT: ld r4, 24(r4) +; P7-NEXT: addi r5, r1, -16 +; P7-NEXT: std r4, -8(r1) +; P7-NEXT: std r4, -16(r1) +; P7-NEXT: lxvd2x vs0, 0, r5 +; P7-NEXT: stxvd2x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds i64, i64* %a, i64 3 %0 = load i64, i64* %arrayidx, align 8 @@ -110,6 +151,15 @@ define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) { ; P8-NEXT: lfiwzx f0, 0, r3 ; P8-NEXT: xxspltw v2, vs0, 1 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvwsx: +; P7: # %bb.0: # %entry +; P7-NEXT: lwz r3, 0(r3) +; P7-NEXT: addi r4, r1, -16 +; P7-NEXT: stw r3, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r4 +; P7-NEXT: xxspltw v2, vs0, 0 +; P7-NEXT: blr entry: %0 = bitcast i32* %s to <4 x i8>* %1 = load <4 x i8>, <4 x i8>* %0, align 4 @@ -129,6 +179,15 @@ define <16 x i8> @adjusted_lxvwsx(i64* %s, i64* %t) { ; P8-NEXT: lfdx f0, 0, r3 ; P8-NEXT: xxspltw v2, vs0, 0 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx: +; P7: # %bb.0: # %entry +; P7-NEXT: ld r3, 0(r3) +; P7-NEXT: addi r4, r1, -16 +; P7-NEXT: std r3, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r4 +; P7-NEXT: xxspltw v2, vs0, 1 +; P7-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* %1 = load <8 x i8>, <8 x i8>* %0, align 8 @@ -147,6 +206,12 @@ define <16 x i8> @unadjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 3 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvwsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 0 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -165,6 +230,12 @@ define <16 x i8> @adjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 2 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 1 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -183,6 +254,12 @@ define <16 x i8> @adjusted_lxvwsx_v16i8_2(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 1 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx_v16i8_2: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 2 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = 
shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -201,6 +278,12 @@ define <16 x i8> @adjusted_lxvwsx_v16i8_3(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 0 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx_v16i8_3: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 3 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -217,6 +300,11 @@ define <16 x i8> @unadjusted_lxvdsx(i64* %s, i64* %t) { ; P8: # %bb.0: # %entry ; P8-NEXT: lxvdsx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvdsx: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvdsx v2, 0, r3 +; P7-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* %1 = load <8 x i8>, <8 x i8>* %0, align 8 @@ -234,6 +322,11 @@ define <16 x i8> @unadjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8: # %bb.0: # %entry ; P8-NEXT: lxvdsx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvdsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvdsx v2, 0, r3 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -252,6 +345,12 @@ define <16 x i8> @adjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: addi r3, r3, 8 ; P8-NEXT: lxvdsx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvdsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: addi r3, r3, 8 +; P7-NEXT: lxvdsx v2, 0, r3 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> From aad41e229966e3371256de6adf8f4812803efaf6 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 7 Jul 2021 14:50:30 -0700 Subject: [PATCH 03/32] [OpaquePtr] Use ArgListEntry::IndirectType for lowering ABI attributes Consolidate PreallocatedType and ByValType into IndirectType, and use that for inalloca. --- llvm/include/llvm/CodeGen/TargetLowering.h | 3 +-- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 7 ++----- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 8 ++------ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 11 +++++++---- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index e2474e21052f1..1cf6ac0e5f949 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -290,8 +290,7 @@ class TargetLoweringBase { bool IsSwiftError : 1; bool IsCFGuardTarget : 1; MaybeAlign Alignment = None; - Type *ByValType = nullptr; - Type *PreallocatedType = nullptr; + Type *IndirectType = nullptr; ArgListEntry() : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false), diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index ffdaf9a547e66..ec40ddc1ff750 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1076,15 +1076,12 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { } MaybeAlign MemAlign = Arg.Alignment; if (Arg.IsByVal || Arg.IsInAlloca || Arg.IsPreallocated) { - PointerType *Ty = cast(Arg.Ty); - Type *ElementTy = Ty->getElementType(); - unsigned FrameSize = - DL.getTypeAllocSize(Arg.ByValType ? Arg.ByValType : ElementTy); + unsigned FrameSize = DL.getTypeAllocSize(Arg.IndirectType); // For ByVal, alignment should come from FE. BE will guess if this info // is not there, but there are cases it cannot get right. 
if (!MemAlign) - MemAlign = Align(TLI.getByValTypeAlignment(ElementTy, DL)); + MemAlign = Align(TLI.getByValTypeAlignment(Arg.IndirectType, DL)); Flags.setByValSize(FrameSize); } else if (!MemAlign) { MemAlign = DL.getABITypeAlign(Arg.Ty); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 941ec61264b4c..baef5e7c4a770 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9578,18 +9578,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } Align MemAlign; if (Args[i].IsByVal || Args[i].IsInAlloca || Args[i].IsPreallocated) { - PointerType *Ty = cast(Args[i].Ty); - Type *ElementTy = Ty->getElementType(); - - unsigned FrameSize = DL.getTypeAllocSize( - Args[i].ByValType ? Args[i].ByValType : ElementTy); + unsigned FrameSize = DL.getTypeAllocSize(Args[i].IndirectType); Flags.setByValSize(FrameSize); // info is not there but there are cases it cannot get right. if (auto MA = Args[i].Alignment) MemAlign = *MA; else - MemAlign = Align(getByValTypeAlignment(ElementTy, DL)); + MemAlign = Align(getByValTypeAlignment(Args[i].IndirectType, DL)); } else if (auto MA = Args[i].Alignment) { MemAlign = *MA; } else { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 00403a9260b01..bc033b06e7a54 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -119,15 +119,18 @@ void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call, IsSwiftAsync = Call->paramHasAttr(ArgIdx, Attribute::SwiftAsync); IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError); Alignment = Call->getParamStackAlign(ArgIdx); - ByValType = nullptr; + IndirectType = nullptr; + assert(IsByVal + IsPreallocated + IsInAlloca <= 1 && + "multiple ABI attributes?"); if (IsByVal) { - ByValType = Call->getParamByValType(ArgIdx); + IndirectType = Call->getParamByValType(ArgIdx); if (!Alignment) Alignment = Call->getParamAlign(ArgIdx); } - PreallocatedType = nullptr; if (IsPreallocated) - PreallocatedType = Call->getParamPreallocatedType(ArgIdx); + IndirectType = Call->getParamPreallocatedType(ArgIdx); + if (IsInAlloca) + IndirectType = Call->getParamInAllocaType(ArgIdx); } /// Generate a libcall taking the given operands as arguments and returning a From b81aa458afd023323dcd4400164f6a43d981d7de Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 7 Jul 2021 15:04:18 -0700 Subject: [PATCH 04/32] [llvm-nm][test] Fix just-symbols.test --- llvm/test/tools/llvm-nm/just-symbols.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/tools/llvm-nm/just-symbols.test b/llvm/test/tools/llvm-nm/just-symbols.test index 24284610ea89f..e0926c8a93209 100644 --- a/llvm/test/tools/llvm-nm/just-symbols.test +++ b/llvm/test/tools/llvm-nm/just-symbols.test @@ -7,7 +7,7 @@ # RUN: llvm-nm --just-symbol-name %t.o | diff %t.txt - # RUN: llvm-nm --format=just-symbols %t.o | diff %t.txt - # RUN: llvm-nm --format=sysv -j %t.o | diff %t.txt - -# RUN: llvm-nm -j --format=posix %t.o | not diff -q %t.txt %t1.txt +# RUN: llvm-nm -j --format=posix %t.o | not diff -q %t.txt - # RUN: FileCheck %s --input-file=%t.txt --implicit-check-not={{.}} --check-prefix=COMMON From 966386514bec9366ca85d1599f4c866eee9f1927 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Fri, 11 Jun 2021 10:32:04 -0700 Subject: [PATCH 05/32] 
[compiler-rt][hwasan] Setup hwasan thread handling on Fuchsia This patch splits up hwasan thread creation between `__sanitizer_before_thread_create_hook`, `__sanitizer_thread_create_hook`, and `__sanitizer_thread_start_hook`. The linux implementation creates the hwasan thread object inside the new thread. On Fuchsia, we know the stack bounds before thread creation, so we can initialize part of the thread object in `__sanitizer_before_thread_create_hook`, then initialize the stack ring buffer in `__sanitizer_thread_start_hook` once we enter the thread. Differential Revision: https://reviews.llvm.org/D104085 --- compiler-rt/lib/hwasan/CMakeLists.txt | 1 + compiler-rt/lib/hwasan/hwasan_fuchsia.cpp | 159 ++++++++++++++++++++++ compiler-rt/lib/hwasan/hwasan_thread.cpp | 5 + 3 files changed, 165 insertions(+) create mode 100644 compiler-rt/lib/hwasan/hwasan_fuchsia.cpp diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt index 137abf7c246ab..f6740dca021b0 100644 --- a/compiler-rt/lib/hwasan/CMakeLists.txt +++ b/compiler-rt/lib/hwasan/CMakeLists.txt @@ -7,6 +7,7 @@ set(HWASAN_RTL_SOURCES hwasan_allocation_functions.cpp hwasan_dynamic_shadow.cpp hwasan_exceptions.cpp + hwasan_fuchsia.cpp hwasan_globals.cpp hwasan_interceptors.cpp hwasan_interceptors_vfork.S diff --git a/compiler-rt/lib/hwasan/hwasan_fuchsia.cpp b/compiler-rt/lib/hwasan/hwasan_fuchsia.cpp new file mode 100644 index 0000000000000..b8e67c4c48221 --- /dev/null +++ b/compiler-rt/lib/hwasan/hwasan_fuchsia.cpp @@ -0,0 +1,159 @@ +//===-- hwasan_fuchsia.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file is a part of HWAddressSanitizer and contains Fuchsia-specific +/// code. +/// +//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_fuchsia.h" +#if SANITIZER_FUCHSIA + +#include "hwasan.h" +#include "hwasan_interface_internal.h" +#include "hwasan_report.h" +#include "hwasan_thread.h" +#include "hwasan_thread_list.h" + +// This TLS variable contains the location of the stack ring buffer and can be +// used to always find the hwasan thread object associated with the current +// running thread. +[[gnu::tls_model("initial-exec")]] +SANITIZER_INTERFACE_ATTRIBUTE +THREADLOCAL uptr __hwasan_tls; + +namespace __hwasan { + +// These are known parameters passed to the hwasan runtime on thread creation. +struct Thread::InitState { + uptr stack_bottom, stack_top; +}; + +static void FinishThreadInitialization(Thread *thread); + +void InitThreads() { + // This is the minimal alignment needed for the storage where hwasan threads + // and their stack ring buffers are placed. This alignment is necessary so the + // stack ring buffer can perform a simple calculation to get the next element + // in the RB. The instructions for this calculation are emitted by the + // compiler. (Full explanation in hwasan_thread_list.h.) + uptr alloc_size = UINT64_C(1) << kShadowBaseAlignment; + uptr thread_start = reinterpret_cast( + MmapAlignedOrDieOnFatalError(alloc_size, alloc_size, __func__)); + + InitThreadList(thread_start, alloc_size); + + // Create the hwasan thread object for the current (main) thread. 
Stack info + // for this thread is known from information passed via + // __sanitizer_startup_hook. + const Thread::InitState state = { + .stack_bottom = __sanitizer::MainThreadStackBase, + .stack_top = + __sanitizer::MainThreadStackBase + __sanitizer::MainThreadStackSize, + }; + FinishThreadInitialization(hwasanThreadList().CreateCurrentThread(&state)); +} + +uptr *GetCurrentThreadLongPtr() { return &__hwasan_tls; } + +// This is called from the parent thread before the new thread is created. Here +// we can propagate known info like the stack bounds to Thread::Init before +// jumping into the thread. We cannot initialize the stack ring buffer yet since +// we have not entered the new thread. +static void *BeforeThreadCreateHook(uptr user_id, bool detached, + const char *name, uptr stack_bottom, + uptr stack_size) { + const Thread::InitState state = { + .stack_bottom = stack_bottom, + .stack_top = stack_bottom + stack_size, + }; + return hwasanThreadList().CreateCurrentThread(&state); +} + +// This sets the stack top and bottom according to the InitState passed to +// CreateCurrentThread above. +void Thread::InitStackAndTls(const InitState *state) { + CHECK_NE(state->stack_bottom, 0); + CHECK_NE(state->stack_top, 0); + stack_bottom_ = state->stack_bottom; + stack_top_ = state->stack_top; + tls_end_ = tls_begin_ = 0; +} + +// This is called after creating a new thread with the pointer returned by +// BeforeThreadCreateHook. We are still in the creating thread and should check +// if it was actually created correctly. +static void ThreadCreateHook(void *hook, bool aborted) { + Thread *thread = static_cast(hook); + if (!aborted) { + // The thread was created successfully. + // ThreadStartHook can already be running in the new thread. + } else { + // The thread wasn't created after all. + // Clean up everything we set up in BeforeThreadCreateHook. + atomic_signal_fence(memory_order_seq_cst); + hwasanThreadList().ReleaseThread(thread); + } +} + +// This is called in the newly-created thread before it runs anything else, +// with the pointer returned by BeforeThreadCreateHook (above). Here we can +// setup the stack ring buffer. +static void ThreadStartHook(void *hook, thrd_t self) { + Thread *thread = static_cast(hook); + FinishThreadInitialization(thread); + thread->InitRandomState(); +} + +// This is the function that sets up the stack ring buffer and enables us to use +// GetCurrentThread. This function should only be called while IN the thread +// that we want to create the hwasan thread object for so __hwasan_tls can be +// properly referenced. +static void FinishThreadInitialization(Thread *thread) { + CHECK_NE(thread, nullptr); + + // The ring buffer is located immediately before the thread object. 
+ uptr stack_buffer_size = hwasanThreadList().GetRingBufferSize(); + uptr stack_buffer_start = reinterpret_cast(thread) - stack_buffer_size; + thread->InitStackRingBuffer(stack_buffer_start, stack_buffer_size); +} + +static void ThreadExitHook(void *hook, thrd_t self) { + Thread *thread = static_cast(hook); + atomic_signal_fence(memory_order_seq_cst); + hwasanThreadList().ReleaseThread(thread); +} + +} // namespace __hwasan + +extern "C" { + +void *__sanitizer_before_thread_create_hook(thrd_t thread, bool detached, + const char *name, void *stack_base, + size_t stack_size) { + return __hwasan::BeforeThreadCreateHook( + reinterpret_cast(thread), detached, name, + reinterpret_cast(stack_base), stack_size); +} + +void __sanitizer_thread_create_hook(void *hook, thrd_t thread, int error) { + __hwasan::ThreadCreateHook(hook, error != thrd_success); +} + +void __sanitizer_thread_start_hook(void *hook, thrd_t self) { + __hwasan::ThreadStartHook(hook, reinterpret_cast(self)); +} + +void __sanitizer_thread_exit_hook(void *hook, thrd_t self) { + __hwasan::ThreadExitHook(hook, self); +} + +} // extern "C" + +#endif // SANITIZER_FUCHSIA diff --git a/compiler-rt/lib/hwasan/hwasan_thread.cpp b/compiler-rt/lib/hwasan/hwasan_thread.cpp index 764ca4f651b37..ee747a3beea5e 100644 --- a/compiler-rt/lib/hwasan/hwasan_thread.cpp +++ b/compiler-rt/lib/hwasan/hwasan_thread.cpp @@ -46,7 +46,12 @@ void Thread::Init(uptr stack_buffer_start, uptr stack_buffer_size, heap_allocations_ = HeapAllocationsRingBuffer::New(sz); InitStackAndTls(state); +#if !SANITIZER_FUCHSIA + // Do not initialize the stack ring buffer just yet on Fuchsia. Threads will + // be initialized before we enter the thread itself, so we will instead call + // this later. InitStackRingBuffer(stack_buffer_start, stack_buffer_size); +#endif } void Thread::InitStackRingBuffer(uptr stack_buffer_start, From 2c60d22610325bcd6fb4c4bcc8b522b9fdfb46ee Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Wed, 7 Jul 2021 02:22:45 +0200 Subject: [PATCH 06/32] [clang] disable P2266 simpler implicit moves under -fms-compatibility The Microsoft STL currently has some issues with P2266. We disable it for now in that mode, but we might come back later with a more targetted approach. Signed-off-by: Matheus Izvekov Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D105518 --- clang/lib/Frontend/InitPreprocessor.cpp | 3 +- clang/lib/Sema/SemaStmt.cpp | 7 ++- .../cxx2b-p2266-disable-with-msvc-compat.cpp | 50 +++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 clang/test/SemaCXX/cxx2b-p2266-disable-with-msvc-compat.cpp diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index bca0bb4ada672..676421552a757 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -598,7 +598,8 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, } // C++2b features. 
if (LangOpts.CPlusPlus2b) { - Builder.defineMacro("__cpp_implicit_move", "202011L"); + if (!LangOpts.MSVCCompat) + Builder.defineMacro("__cpp_implicit_move", "202011L"); Builder.defineMacro("__cpp_size_t_suffix", "202011L"); } if (LangOpts.Char8) diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 506c06b412b6f..59e64c4b1c5b1 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3333,8 +3333,13 @@ Sema::NamedReturnInfo Sema::getNamedReturnInfo(Expr *&E, bool ForceCXX2b) { if (!VD) return NamedReturnInfo(); NamedReturnInfo Res = getNamedReturnInfo(VD); + // FIXME: We supress simpler implicit move here (unless ForceCXX2b is true) + // in msvc compatibility mode just as a temporary work around, + // as the MSVC STL has issues with this change. + // We will come back later with a more targeted approach. if (Res.Candidate && !E->isXValue() && - (ForceCXX2b || getLangOpts().CPlusPlus2b)) { + (ForceCXX2b || + (getLangOpts().CPlusPlus2b && !getLangOpts().MSVCCompat))) { E = ImplicitCastExpr::Create(Context, VD->getType().getNonReferenceType(), CK_NoOp, E, nullptr, VK_XValue, FPOptionsOverride()); diff --git a/clang/test/SemaCXX/cxx2b-p2266-disable-with-msvc-compat.cpp b/clang/test/SemaCXX/cxx2b-p2266-disable-with-msvc-compat.cpp new file mode 100644 index 0000000000000..2143c0535e606 --- /dev/null +++ b/clang/test/SemaCXX/cxx2b-p2266-disable-with-msvc-compat.cpp @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -std=c++2b -fsyntax-only -fcxx-exceptions -verify=new %s +// RUN: %clang_cc1 -std=c++2b -fsyntax-only -fcxx-exceptions -fms-compatibility -verify=old %s +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=old %s + +// FIXME: This is a test for a temporary workaround where we disable simpler implicit moves +// when compiling with -fms-compatibility, because the MSVC STL does not compile. +// A better workaround is under discussion. +// The test cases here are just a copy from `CXX/class/class.init/class.copy.elision/p3.cpp`, +// so feel free to delete this file when the workaround is not needed anymore. + +struct CopyOnly { + CopyOnly(); // new-note {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} + // new-note@-1 {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} + CopyOnly(CopyOnly &); // new-note {{candidate constructor not viable: expects an lvalue for 1st argument}} + // new-note@-1 {{candidate constructor not viable: expects an lvalue for 1st argument}} +}; +struct MoveOnly { + MoveOnly(); + MoveOnly(MoveOnly &&); +}; +MoveOnly &&rref(); + +MoveOnly &&test1(MoveOnly &&w) { + return w; // old-error {{cannot bind to lvalue of type}} +} + +CopyOnly test2(bool b) { + static CopyOnly w1; + CopyOnly w2; + if (b) { + return w1; + } else { + return w2; // new-error {{no matching constructor for initialization}} + } +} + +template T &&test3(T &&x) { return x; } // old-error {{cannot bind to lvalue of type}} +template MoveOnly &test3(MoveOnly &); +template MoveOnly &&test3(MoveOnly &&); // old-note {{in instantiation of function template specialization}} + +MoveOnly &&test4() { + MoveOnly &&x = rref(); + return x; // old-error {{cannot bind to lvalue of type}} +} + +void test5() try { + CopyOnly x; + throw x; // new-error {{no matching constructor for initialization}} +} catch (...) 
{ +} From 398bfa2eadbea371ab20f4dd8cadbef432b35627 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Wed, 2 Jun 2021 11:33:49 -0700 Subject: [PATCH 07/32] [compiler-rt][Fuchsia] Disable interceptors while enabling new/delete replacements This disables use of hwasan interceptors which we do not use on Fuchsia. This explicitly sets the macro for defining the hwasan versions of new/delete. Differential Revision: https://reviews.llvm.org/D103544 --- compiler-rt/CMakeLists.txt | 8 ++++++-- compiler-rt/lib/hwasan/CMakeLists.txt | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index f5b07cee45c47..cdb33087ab53b 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -67,8 +67,12 @@ if (NOT COMPILER_RT_ASAN_SHADOW_SCALE STREQUAL "") -D${COMPILER_RT_ASAN_SHADOW_SCALE_DEFINITION}) endif() -set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS ON CACHE BOOL - "Enable libc interceptors in HWASan (testing mode)") +if(FUCHSIA) + set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS_DEFAULT OFF) +else() + set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS_DEFAULT ON) +endif() +set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS ${COMPILER_RT_HWASAN_WITH_INTERCEPTORS_DEFAULT} CACHE BOOL "Enable libc interceptors in HWASan (testing mode)") set(COMPILER_RT_BAREMETAL_BUILD OFF CACHE BOOL "Build for a bare-metal target.") diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt index f6740dca021b0..d65c9b843c1b5 100644 --- a/compiler-rt/lib/hwasan/CMakeLists.txt +++ b/compiler-rt/lib/hwasan/CMakeLists.txt @@ -45,6 +45,11 @@ set(HWASAN_RTL_HEADERS set(HWASAN_DEFINITIONS) append_list_if(COMPILER_RT_HWASAN_WITH_INTERCEPTORS HWASAN_WITH_INTERCEPTORS=1 HWASAN_DEFINITIONS) +if(FUCHSIA) + # Set this explicitly on Fuchsia, otherwise the default value is set to HWASAN_WITH_INTERCEPTORS. + list(APPEND HWASAN_DEFINITIONS HWASAN_REPLACE_OPERATORS_NEW_AND_DELETE=1) +endif() + set(HWASAN_RTL_CFLAGS ${SANITIZER_COMMON_CFLAGS}) append_rtti_flag(OFF HWASAN_RTL_CFLAGS) append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC HWASAN_RTL_CFLAGS) From 0fdb25cd954c5aaf86259e713f03d119ab9f2700 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 7 Jul 2021 14:25:24 -0700 Subject: [PATCH 08/32] [AMDGPU] Disable garbage collection passes Differential Revision: https://reviews.llvm.org/D105593 --- llvm/include/llvm/CodeGen/Passes.h | 8 +++++++ llvm/lib/CodeGen/CodeGen.cpp | 1 + llvm/lib/CodeGen/GCRootLowering.cpp | 1 + llvm/lib/CodeGen/ShadowStackGCLowering.cpp | 1 + llvm/lib/CodeGen/TargetPassConfig.cpp | 4 ++-- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 22 +++++-------------- 7 files changed, 22 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index d0fe1a264b74e..76667eac051dd 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -270,11 +270,19 @@ namespace llvm { /// operations. FunctionPass *createGCLoweringPass(); + /// GCLowering Pass - Used by gc.root to perform its default lowering + /// operations. + extern char &GCLoweringID; + /// ShadowStackGCLowering - Implements the custom lowering mechanism /// used by the shadow stack GC. Only runs on functions which opt in to /// the shadow stack collector. FunctionPass *createShadowStackGCLoweringPass(); + /// ShadowStackGCLowering - Implements the custom lowering mechanism + /// used by the shadow stack GC. 
+ extern char &ShadowStackGCLoweringID; + /// GCMachineCodeAnalysis - Target-independent pass to mark safe points /// in machine code. Must be added very late during code generation, just /// prior to output, and importantly after all CFG transformations (such as diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index d2400d0371e3c..708325298aaeb 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -99,6 +99,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); initializeSafeStackLegacyPassPass(Registry); + initializeShadowStackGCLoweringPass(Registry); initializeShrinkWrapPass(Registry); initializeSjLjEHPreparePass(Registry); initializeSlotIndexesPass(Registry); diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp index faf0fb7f09a7a..58269e172c573 100644 --- a/llvm/lib/CodeGen/GCRootLowering.cpp +++ b/llvm/lib/CodeGen/GCRootLowering.cpp @@ -85,6 +85,7 @@ INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false) FunctionPass *llvm::createGCLoweringPass() { return new LowerIntrinsics(); } char LowerIntrinsics::ID = 0; +char &llvm::GCLoweringID = LowerIntrinsics::ID; LowerIntrinsics::LowerIntrinsics() : FunctionPass(ID) { initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry()); diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index 36752ef86526d..86b559fd64130 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -89,6 +89,7 @@ class ShadowStackGCLowering : public FunctionPass { } // end anonymous namespace char ShadowStackGCLowering::ID = 0; +char &llvm::ShadowStackGCLoweringID = ShadowStackGCLowering::ID; INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 2d8ecd5025fa1..2a4f6bfd98b03 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -857,8 +857,8 @@ void TargetPassConfig::addIRPasses() { // Run GC lowering passes for builtin collectors // TODO: add a pass insertion point here - addPass(createGCLoweringPass()); - addPass(createShadowStackGCLoweringPass()); + addPass(&GCLoweringID); + addPass(&ShadowStackGCLoweringID); addPass(createLowerConstantIntrinsicsPass()); // Make sure that no unreachable blocks are instruction selected. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f611c893cde4c..7f74204229c20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -763,6 +763,9 @@ class AMDGPUPassConfig : public TargetPassConfig { // anything. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); + // Garbage collection is not supported. 
+ disablePass(&GCLoweringID); + disablePass(&ShadowStackGCLoweringID); } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 4d42307327658..80d05799281a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -22,8 +22,8 @@ ; GCN-O0-NEXT: Target Transform Information ; GCN-O0-NEXT: Assumption Cache Tracker ; GCN-O0-NEXT: Profile summary info -; GCN-O0-NEXT: Create Garbage Collector Module Metadata ; GCN-O0-NEXT: Argument Register Usage Information Storage +; GCN-O0-NEXT: Create Garbage Collector Module Metadata ; GCN-O0-NEXT: Register Usage Information Storage ; GCN-O0-NEXT: Machine Branch Probability Analysis ; GCN-O0-NEXT: ModulePass Manager @@ -43,9 +43,7 @@ ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: Expand Atomic instructions -; GCN-O0-NEXT: Lower Garbage Collection Instructions -; GCN-O0-NEXT: Shadow Stack GC Lowering +; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Lower constant intrinsics ; GCN-O0-NEXT: Remove unreachable blocks from the CFG ; GCN-O0-NEXT: Expand vector predication intrinsics @@ -165,8 +163,8 @@ ; GCN-O1-NEXT: External Alias Analysis ; GCN-O1-NEXT: Type-Based Alias Analysis ; GCN-O1-NEXT: Scoped NoAlias Alias Analysis -; GCN-O1-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-NEXT: Argument Register Usage Information Storage +; GCN-O1-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-NEXT: Machine Branch Probability Analysis ; GCN-O1-NEXT: Register Usage Information Storage ; GCN-O1-NEXT: ModulePass Manager @@ -209,8 +207,6 @@ ; GCN-O1-NEXT: Lazy Branch Probability Analysis ; GCN-O1-NEXT: Lazy Block Frequency Analysis ; GCN-O1-NEXT: Expand memcmp() to load/stores -; GCN-O1-NEXT: Lower Garbage Collection Instructions -; GCN-O1-NEXT: Shadow Stack GC Lowering ; GCN-O1-NEXT: Lower constant intrinsics ; GCN-O1-NEXT: Remove unreachable blocks from the CFG ; GCN-O1-NEXT: Natural Loop Information @@ -413,8 +409,8 @@ ; GCN-O1-OPTS-NEXT: External Alias Analysis ; GCN-O1-OPTS-NEXT: Type-Based Alias Analysis ; GCN-O1-OPTS-NEXT: Scoped NoAlias Alias Analysis -; GCN-O1-OPTS-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-OPTS-NEXT: Argument Register Usage Information Storage +; GCN-O1-OPTS-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-OPTS-NEXT: Machine Branch Probability Analysis ; GCN-O1-OPTS-NEXT: Register Usage Information Storage ; GCN-O1-OPTS-NEXT: ModulePass Manager @@ -475,8 +471,6 @@ ; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis ; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Expand memcmp() to load/stores -; GCN-O1-OPTS-NEXT: Lower Garbage Collection Instructions -; GCN-O1-OPTS-NEXT: Shadow Stack GC Lowering ; GCN-O1-OPTS-NEXT: Lower constant intrinsics ; GCN-O1-OPTS-NEXT: Remove unreachable blocks from the CFG ; GCN-O1-OPTS-NEXT: Natural Loop Information @@ -694,8 +688,8 @@ ; GCN-O2-NEXT: External Alias Analysis ; GCN-O2-NEXT: Type-Based Alias Analysis ; GCN-O2-NEXT: Scoped NoAlias Alias Analysis -; GCN-O2-NEXT: Create Garbage Collector Module Metadata ; GCN-O2-NEXT: Argument Register Usage Information Storage +; GCN-O2-NEXT: Create Garbage Collector Module Metadata ; GCN-O2-NEXT: Machine Branch Probability Analysis ; GCN-O2-NEXT: Register Usage Information Storage ; GCN-O2-NEXT: ModulePass Manager @@ -756,8 +750,6 @@ ; 
GCN-O2-NEXT: Lazy Branch Probability Analysis ; GCN-O2-NEXT: Lazy Block Frequency Analysis ; GCN-O2-NEXT: Expand memcmp() to load/stores -; GCN-O2-NEXT: Lower Garbage Collection Instructions -; GCN-O2-NEXT: Shadow Stack GC Lowering ; GCN-O2-NEXT: Lower constant intrinsics ; GCN-O2-NEXT: Remove unreachable blocks from the CFG ; GCN-O2-NEXT: Natural Loop Information @@ -976,8 +968,8 @@ ; GCN-O3-NEXT: External Alias Analysis ; GCN-O3-NEXT: Type-Based Alias Analysis ; GCN-O3-NEXT: Scoped NoAlias Alias Analysis -; GCN-O3-NEXT: Create Garbage Collector Module Metadata ; GCN-O3-NEXT: Argument Register Usage Information Storage +; GCN-O3-NEXT: Create Garbage Collector Module Metadata ; GCN-O3-NEXT: Machine Branch Probability Analysis ; GCN-O3-NEXT: Register Usage Information Storage ; GCN-O3-NEXT: ModulePass Manager @@ -1043,8 +1035,6 @@ ; GCN-O3-NEXT: Lazy Branch Probability Analysis ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Expand memcmp() to load/stores -; GCN-O3-NEXT: Lower Garbage Collection Instructions -; GCN-O3-NEXT: Shadow Stack GC Lowering ; GCN-O3-NEXT: Lower constant intrinsics ; GCN-O3-NEXT: Remove unreachable blocks from the CFG ; GCN-O3-NEXT: Natural Loop Information From 877e835addd78126a2ff2b8030af17a5ea92df09 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 7 Jul 2021 19:27:19 -0400 Subject: [PATCH 09/32] [gn build] (semi-manually) port 966386514bec --- llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn index 1718523875250..7deb8a3d3ed70 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn @@ -52,6 +52,7 @@ source_set("sources") { "hwasan_dynamic_shadow.cpp", "hwasan_dynamic_shadow.h", "hwasan_exceptions.cpp", + "hwasan_fuchsia.cpp", "hwasan_flags.h", "hwasan_globals.cpp", "hwasan_globals.h", From 74c308c56a2d0f000dfed3287311ce46a94ae3c8 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 7 Jul 2021 16:50:23 -0700 Subject: [PATCH 10/32] [Bazel] Fixes for b5d847b1b95750d0af40cfc8c71a8fec50bb8613 and 6412a13539ab2914eed8e1df83c399b9a16e3408 --- .../llvm-project-overlay/mlir/BUILD.bazel | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 9cab8fa6b117b..27f96f443031e 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2805,6 +2805,7 @@ cc_library( ":GPUToNVVMGen", ":GPUTransforms", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":MathDialect", ":MemRefDialect", @@ -2888,6 +2889,7 @@ cc_library( ":GPUDialect", ":GPUToROCDLTGen", ":GPUTransforms", + ":LLVMCommonConversion", ":MathDialect", ":Pass", ":ROCDLDialect", @@ -3012,6 +3014,7 @@ cc_library( ":ConversionPassIncGen", ":GPUDialect", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":Pass", ":SPIRVDialect", @@ -4236,6 +4239,22 @@ alias( actual = "SCFToStandard", ) +cc_library( + name = "LLVMCommonConversion", + srcs = glob([ + "lib/Conversion/LLVMCommon/*.cpp", + ]) + ["lib/Conversion/LLVMCommon/MemRefDescriptor.h"], + hdrs = glob(["include/mlir/Conversion/LLVMCommon/*.h"]), + includes = ["include"], + deps = [ + ":IR", + ":LLVMDialect", + ":Support", + ":Transforms", + "//llvm:Core", + ], +) + cc_library( name = "StandardToLLVM", srcs = [ @@ -4253,6 +4272,7 @@ 
cc_library( ":DataLayoutInterfaces", ":DialectUtils", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":MathDialect", ":MemRefDialect", @@ -5127,6 +5147,7 @@ cc_binary( ":GPUToSPIRV", ":GPUToVulkanTransforms", ":GPUTransforms", + ":LLVMCommonConversion", ":LLVMDialect", ":LLVMToLLVMIRTranslation", ":MemRefDialect", @@ -6229,6 +6250,7 @@ cc_library( ], deps = [ ":ConversionPassIncGen", + ":DialectUtils", ":IR", ":LinalgOps", ":MathDialect", @@ -6378,6 +6400,7 @@ cc_library( ":ComplexDialect", ":ConversionPassIncGen", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":Pass", ":StandardToLLVM", From e37dbc6e5703c2755d5fb81949eb32f07bc6ebd6 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 7 Jul 2021 21:59:07 -0400 Subject: [PATCH 11/32] [gn build] (manually) port ef16c8eaa5cd5679759 (MCACustomBehaviorAMDGPU) --- .../gn/secondary/llvm/tools/llvm-mca/BUILD.gn | 7 +++++++ .../llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn | 15 +++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 llvm/utils/gn/secondary/llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn index dcea89146765b..458598b682f86 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn @@ -1,3 +1,5 @@ +import("//llvm/lib/Target/targets.gni") + executable("llvm-mca") { deps = [ "//llvm/lib/MC", @@ -30,4 +32,9 @@ executable("llvm-mca") { "Views/View.cpp", "llvm-mca.cpp", ] + defines = [] + if (llvm_build_AMDGPU) { + deps += [ "//llvm/tools/llvm-mca/lib/AMDGPU" ] + defines += [ "HAS_AMDGPU" ] + } } diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn new file mode 100644 index 0000000000000..3bde981c58add --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn @@ -0,0 +1,15 @@ +static_library("AMDGPU") { + output_name = "LLVMMCACustomBehaviourAMDGPU" + deps = [ + "//llvm/lib/IR", + "//llvm/lib/Support", + "//llvm/lib/Target/AMDGPU", + + # llvm-mca/libAMDGPU reaches inside the Target/AMDGPU tablegen internals + # and must depend on these Target/AMDGPU-internal build targets. + "//llvm/lib/Target/AMDGPU/MCTargetDesc", + "//llvm/lib/Target/AMDGPU/Utils", + ] + include_dirs = [ "//llvm/lib/Target/AMDGPU" ] + sources = [ "AMDGPUCustomBehaviour.cpp" ] +} From 31d10ea10ee1c24e6c7d7c172e52960717d41817 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Thu, 8 Jul 2021 02:24:48 +0000 Subject: [PATCH 12/32] [AIX] Don't pass no-integrated-as by default D105314 added the abibility choose to use AsmParser for parsing inline asm. -no-intergrated-as will override this default if specified explicitly. If toolchain choose to use MCAsmParser for inline asm, don't pass the option to disable integrated-as explictly unless set by user. 
Reviewed By: #powerpc, shchenz Differential Revision: https://reviews.llvm.org/D105512 --- clang/include/clang/Driver/ToolChain.h | 4 ++++ clang/lib/Driver/ToolChains/AIX.cpp | 2 ++ clang/lib/Driver/ToolChains/AIX.h | 4 ++++ clang/lib/Driver/ToolChains/Clang.cpp | 4 +++- clang/test/Driver/aix-as.c | 15 +++++++++++++++ 5 files changed, 28 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index 8ec4cf853fed2..882ae40086cea 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -380,6 +380,10 @@ class ToolChain { /// Check if the toolchain should use the integrated assembler. virtual bool useIntegratedAs() const; + /// Check if the toolchain should use AsmParser to parse inlineAsm when + /// integrated assembler is not default. + virtual bool parseInlineAsmUsingAsmParser() const { return false; } + /// IsMathErrnoDefault - Does this tool chain use -fmath-errno by default. virtual bool IsMathErrnoDefault() const { return true; } diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index ca3fc5af76895..3000b8416adfd 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -176,6 +176,8 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, /// AIX - AIX tool chain which can call as(1) and ld(1) directly. AIX::AIX(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : ToolChain(D, Triple, Args) { + ParseInlineAsmUsingAsmParser = Args.hasFlag( + options::OPT_fintegrated_as, options::OPT_fno_integrated_as, true); getLibraryPaths().push_back(getDriver().SysRoot + "/usr/lib"); } diff --git a/clang/lib/Driver/ToolChains/AIX.h b/clang/lib/Driver/ToolChains/AIX.h index 1534af950c88f..d1ec6d10fb3a0 100644 --- a/clang/lib/Driver/ToolChains/AIX.h +++ b/clang/lib/Driver/ToolChains/AIX.h @@ -59,6 +59,9 @@ class LLVM_LIBRARY_VISIBILITY AIX : public ToolChain { AIX(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args); + bool parseInlineAsmUsingAsmParser() const override { + return ParseInlineAsmUsingAsmParser; + } bool isPICDefault() const override { return true; } bool isPIEDefault() const override { return false; } bool isPICDefaultForced() const override { return true; } @@ -87,6 +90,7 @@ class LLVM_LIBRARY_VISIBILITY AIX : public ToolChain { private: llvm::StringRef GetHeaderSysroot(const llvm::opt::ArgList &DriverArgs) const; + bool ParseInlineAsmUsingAsmParser; }; } // end namespace toolchains diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9c0922c8497cf..85204ceaa49a2 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5038,7 +5038,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, << A->getValue() << A->getOption().getName(); } - if (!TC.useIntegratedAs()) + // If toolchain choose to use MCAsmParser for inline asm don't pass the + // option to disable integrated-as explictly. 
+ if (!TC.useIntegratedAs() && !TC.parseInlineAsmUsingAsmParser()) CmdArgs.push_back("-no-integrated-as"); if (Args.hasArg(options::OPT_fdebug_pass_structure)) { diff --git a/clang/test/Driver/aix-as.c b/clang/test/Driver/aix-as.c index aa8c610359037..def2adc97daaa 100644 --- a/clang/test/Driver/aix-as.c +++ b/clang/test/Driver/aix-as.c @@ -63,3 +63,18 @@ // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-MultiInput: "-a32" // CHECK-AS32-MultiInput: "-many" + +// Check not passing no-integrated-as flag by default. +// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \ +// RUN: -target powerpc64-ibm-aix7.1.0.0 \ +// RUN: | FileCheck --check-prefix=CHECK-IAS --implicit-check-not=-no-integrated-as %s +// CHECK-IAS: InstalledDir +// CHECK-IAS: "-a64" + +// Check passing no-integrated-as flag if specified by user. +// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \ +// RUN: -target powerpc64-ibm-aix7.1.0.0 -fno-integrated-as \ +// RUN: | FileCheck --check-prefix=CHECK-NOIAS %s +// CHECK-NOIAS: InstalledDir +// CHECK-NOIAS: -no-integrated-as +// CHECK-NOIAS: "-a64" From a22ecb4508288f6900ad9216ef1490ab72ad68ed Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Thu, 8 Jul 2021 11:05:09 +0800 Subject: [PATCH 13/32] [PowerPC] Fix i64 to vector lowering on big endian Lowering for scalar to vector would skip if current subtarget is big endian and the scalar is larger or equal than 64 bits. However there's some issue in implementation that SToVRHS may refer to SToVLHS's scalar size if SToVLHS is present, which leads to some crash.o Reviewed By: nemanjai, shchenz Differential Revision: https://reviews.llvm.org/D105094 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 20 +++++------ .../PowerPC/p8-scalar_vector_conversions.ll | 33 +++++++++++++++++++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 99b8cd5d20d30..39cf24b00ac3d 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14524,18 +14524,15 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements() : SToVRHS.getValueType().getVectorNumElements(); int NumEltsOut = ShuffV.size(); - unsigned InElemSizeInBits = - SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() - : SToVRHS.getValueType().getScalarSizeInBits(); - unsigned OutElemSizeInBits = SToVLHS - ? LHS.getValueType().getScalarSizeInBits() - : RHS.getValueType().getScalarSizeInBits(); - // The width of the "valid lane" (i.e. the lane that contains the value that // is vectorized) needs to be expressed in terms of the number of elements // of the shuffle. It is thereby the ratio of the values before and after // any bitcast. - unsigned ValidLaneWidth = InElemSizeInBits / OutElemSizeInBits; + unsigned ValidLaneWidth = + SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() / + LHS.getValueType().getScalarSizeInBits() + : SToVRHS.getValueType().getVectorNumElements() / + RHS.getValueType().getScalarSizeInBits(); // Initially assume that neither input is permuted. These will be adjusted // accordingly if either input is. @@ -14548,9 +14545,10 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, // ISD::SCALAR_TO_VECTOR. 
// On big endian systems, this only makes sense for element sizes smaller // than 64 bits since for 64-bit elements, all instructions already put - // the value into element zero. + // the value into element zero. Since scalar size of LHS and RHS may differ + // after isScalarToVec, this should be checked using their own sizes. if (SToVLHS) { - if (!IsLittleEndian && InElemSizeInBits >= 64) + if (!IsLittleEndian && SToVLHS.getValueType().getScalarSizeInBits() >= 64) return Res; // Set up the values for the shuffle vector fixup. LHSMaxIdx = NumEltsOut / NumEltsIn; @@ -14560,7 +14558,7 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, LHS = SToVLHS; } if (SToVRHS) { - if (!IsLittleEndian && InElemSizeInBits >= 64) + if (!IsLittleEndian && SToVRHS.getValueType().getScalarSizeInBits() >= 64) return Res; RHSMinIdx = NumEltsOut; RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx; diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll index 6a251e076005a..27ee2fda1f1b0 100644 --- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -2561,3 +2561,36 @@ entry: ret double %vecext ; FIXME: add check patterns when variable element extraction is implemented } + +; To check when LHS is i32 to vector and RHS is i64 to vector, +; the combination should be skipped properly. +define <2 x i64> @buildi2(i64 %arg, i32 %arg1) { +; CHECK-LABEL: buildi2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 32 +; CHECK-NEXT: mtfprd f1, r3 +; CHECK-NEXT: mtfprd f0, r4 +; CHECK-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-NEXT: blr +; +; CHECK-LE-LABEL: buildi2: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mtfprwz f0, r4 +; CHECK-LE-NEXT: mtfprd f1, r3 +; CHECK-LE-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-NEXT: blr +; +; CHECK-AIX-LABEL: buildi2: +; CHECK-AIX: # %bb.0: # %entry +; CHECK-AIX-NEXT: sldi 4, 4, 32 +; CHECK-AIX-NEXT: mtfprd 1, 3 +; CHECK-AIX-NEXT: mtfprd 0, 4 +; CHECK-AIX-NEXT: xxmrghd 34, 0, 1 +; CHECK-AIX-NEXT: blr +entry: + %lhs.i32 = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.i32 to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} From d38b9f1f31b1fa8ee885cfcd4ee7bd69771088c8 Mon Sep 17 00:00:00 2001 From: Patrick Holland Date: Wed, 7 Jul 2021 20:48:42 -0700 Subject: [PATCH 14/32] Revert "[MCA] [AMDGPU] Adding an implementation to AMDGPUCustomBehaviour for handling s_waitcnt instructions." Build failures when building with shared libraries. Reverting until I can fix. Differential Revision: https://reviews.llvm.org/D104730 --- llvm/lib/Target/AMDGPU/SISchedule.td | 10 - .../test/tools/llvm-mca/AMDGPU/gfx10-double.s | 68 ++-- .../tools/llvm-mca/AMDGPU/gfx9-retireooo.s | 233 -------------- .../lib/AMDGPU/AMDGPUCustomBehaviour.cpp | 298 +----------------- .../lib/AMDGPU/AMDGPUCustomBehaviour.h | 48 +-- 5 files changed, 36 insertions(+), 621 deletions(-) delete mode 100644 llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 0792b303b8309..b24c061af7ab7 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -137,7 +137,6 @@ def MIReadVGPR : SchedReadVariant<[ // The latency values are 1 / (operations / cycle) / 4. 
multiclass SICommonWriteRes { - let RetireOOO = 1 in { // llvm-mca specific flag def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; // Can be between 2 and 64 @@ -160,7 +159,6 @@ multiclass SICommonWriteRes { def : HWWriteRes; let ResourceCycles = [16] in def : HWWriteRes; - } // End RetireOOO = 1 def : ReadAdvance; @@ -184,7 +182,6 @@ let SchedModel = SIFullSpeedModel in { defm : SICommonWriteRes; -let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -192,7 +189,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; @@ -202,7 +198,6 @@ let SchedModel = SIQuarterSpeedModel in { defm : SICommonWriteRes; -let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -210,7 +205,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; @@ -224,7 +218,6 @@ let SchedModel = SIDPFullSpeedModel in { defm : SICommonWriteRes; -let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -232,7 +225,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; @@ -248,7 +240,6 @@ let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). // Add 1 stall cycle for VGPR read. -let RetireOOO = 1 in { // llvm-mca specific flag def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; @@ -268,7 +259,6 @@ def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s index 00b429ef6d67d..0ffdad05cfa67 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s @@ -41,12 +41,12 @@ v_sqrt_f64 v[4:5], v[4:5] # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 28 -# CHECK-NEXT: Total Cycles: 205 +# CHECK-NEXT: Total Cycles: 224 # CHECK-NEXT: Total uOps: 29 # CHECK: Dispatch Width: 1 -# CHECK-NEXT: uOps Per Cycle: 0.14 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 0.13 +# CHECK-NEXT: IPC: 0.13 # CHECK-NEXT: Block RThroughput: 29.0 # CHECK: Instruction Info: @@ -133,37 +133,37 @@ v_sqrt_f64 v[4:5], v[4:5] # CHECK-NEXT: - - - 1.00 - 1.00 1.00 - v_sqrt_f64_e32 v[4:5], v[4:5] # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 01234 - -# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 -# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] -# CHECK-NEXT: [0,3] . 
DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6 -# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] -# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 -# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7] -# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3] -# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5] -# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7] -# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9] -# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] -# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 -# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . 
v_rcp_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123 + +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] +# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6 +# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] +# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 +# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7] +# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3] +# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5] +# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7] +# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9] +# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . 
DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] +# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 +# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s deleted file mode 100644 index 706ed36f9e980..0000000000000 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s +++ /dev/null @@ -1,233 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx900 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s - -s_load_dwordx2 s[2:3], s[0:1], 0x24 -s_load_dwordx2 s[0:1], s[0:1], 0x2c -s_waitcnt lgkmcnt(0) -v_mov_b32_e32 v0, s2 -v_mov_b32_e32 v1, s3 -flat_load_dword v2, v[0:1] -flat_load_dword v3, v[0:1] offset:8 -flat_load_dword v4, v[0:1] offset:16 -flat_load_dword v5, v[0:1] offset:24 -v_mov_b32_e32 v0, s0 -v_mov_b32_e32 v1, s1 -v_mov_b32_e32 v6, s6 -v_mov_b32_e32 v7, s7 -v_mov_b32_e32 v8, s8 -v_mov_b32_e32 v9, s9 -v_mov_b32_e32 v10, s10 -v_mov_b32_e32 v11, s11 -v_mov_b32_e32 v12, s12 -v_mov_b32_e32 v13, s13 -v_mov_b32_e32 v14, s14 -v_mov_b32_e32 v15, s15 -v_mov_b32_e32 v16, s16 -v_mov_b32_e32 v17, s17 -v_mov_b32_e32 v18, s18 -v_mov_b32_e32 v19, s19 -v_mov_b32_e32 v20, s20 -v_mov_b32_e32 v21, s21 -v_mov_b32_e32 v22, s22 -v_mov_b32_e32 v23, s23 -v_mov_b32_e32 v24, s24 -v_mov_b32_e32 v25, s25 -v_mov_b32_e32 v26, s26 -v_mov_b32_e32 v27, s27 -v_mov_b32_e32 v28, s28 -v_mov_b32_e32 v29, s29 -s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Iterations: 1 -# CHECK-NEXT: Instructions: 36 -# CHECK-NEXT: Total Cycles: 94 -# CHECK-NEXT: Total uOps: 36 - -# CHECK: Dispatch Width: 1 -# CHECK-NEXT: uOps Per Cycle: 0.38 -# CHECK-NEXT: IPC: 0.38 -# CHECK-NEXT: Block RThroughput: 36.0 - -# CHECK: Instruction Info: -# CHECK-NEXT: [1]: #uOps -# CHECK-NEXT: [2]: Latency -# CHECK-NEXT: [3]: RThroughput -# CHECK-NEXT: [4]: MayLoad -# CHECK-NEXT: [5]: MayStore -# CHECK-NEXT: [6]: HasSideEffects (U) - -# CHECK: 
[1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: 1 1 1.00 U s_waitcnt lgkmcnt(0) -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s2 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s3 -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v2, v[0:1] -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s0 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s1 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v6, s6 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v7, s7 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v8, s8 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v9, s9 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v10, s10 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v11, s11 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v12, s12 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v13, s13 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v14, s14 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v15, s15 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v16, s16 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v17, s17 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v18, s18 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v19, s19 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v20, s20 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v21, s21 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v22, s22 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v23, s23 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v24, s24 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v25, s25 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v26, s26 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v27, s27 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v28, s28 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v29, s29 -# CHECK-NEXT: 1 1 1.00 U s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Resources: -# CHECK-NEXT: [0] - HWBranch -# CHECK-NEXT: [1] - HWExport -# CHECK-NEXT: [2] - HWLGKM -# CHECK-NEXT: [3] - HWSALU -# CHECK-NEXT: [4] - HWVALU -# CHECK-NEXT: [5] - HWVMEM -# CHECK-NEXT: [6] - HWXDL - -# CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] -# CHECK-NEXT: - - 2.00 2.00 28.00 4.00 - - -# CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: - - - 1.00 - - - s_waitcnt lgkmcnt(0) -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s2 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s3 -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v2, v[0:1] -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s0 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s1 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v6, s6 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v7, s7 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v8, s8 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v9, s9 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v10, s10 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v11, s11 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v12, s12 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v13, s13 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v14, s14 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v15, s15 -# CHECK-NEXT: - 
- - - 1.00 - - v_mov_b32_e32 v16, s16 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v17, s17 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v18, s18 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v19, s19 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v20, s20 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v21, s21 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v22, s22 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v23, s23 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v24, s24 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v25, s25 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v26, s26 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v27, s27 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v28, s28 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v29, s29 -# CHECK-NEXT: - - - 1.00 - - - s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123 -# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 - -# CHECK: [0,0] DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: [0,1] .DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: [0,2] . .DE . . . . . . . . . . . . . . . . . . s_waitcnt lgkmcnt(0) -# CHECK-NEXT: [0,3] . . DE . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s2 -# CHECK-NEXT: [0,4] . . DE. . . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s3 -# CHECK-NEXT: [0,5] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . flat_load_dword v2, v[0:1] -# CHECK-NEXT: [0,6] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: [0,7] . . .DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: [0,8] . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: [0,9] . . . DE. . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s0 -# CHECK-NEXT: [0,10] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s1 -# CHECK-NEXT: [0,11] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v6, s6 -# CHECK-NEXT: [0,12] . . . .DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v7, s7 -# CHECK-NEXT: [0,13] . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v8, s8 -# CHECK-NEXT: [0,14] . . . . DE. . . . . . . . . . . . . . . . v_mov_b32_e32 v9, s9 -# CHECK-NEXT: [0,15] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v10, s10 -# CHECK-NEXT: [0,16] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v11, s11 -# CHECK-NEXT: [0,17] . . . . .DE . . . . . . . . . . . . . . . v_mov_b32_e32 v12, s12 -# CHECK-NEXT: [0,18] . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v13, s13 -# CHECK-NEXT: [0,19] . . . . . DE. . . . . . . . . . . . . . . v_mov_b32_e32 v14, s14 -# CHECK-NEXT: [0,20] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v15, s15 -# CHECK-NEXT: [0,21] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v16, s16 -# CHECK-NEXT: [0,22] . . . . . .DE . . . . . . . . . . . . . . v_mov_b32_e32 v17, s17 -# CHECK-NEXT: [0,23] . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v18, s18 -# CHECK-NEXT: [0,24] . . . . . . DE. . . . . . . . . . . . . . v_mov_b32_e32 v19, s19 -# CHECK-NEXT: [0,25] . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v20, s20 -# CHECK-NEXT: [0,26] . . . . . . DE . . . . . . . . . . . . . 
v_mov_b32_e32 v21, s21 -# CHECK-NEXT: [0,27] . . . . . . .DE . . . . . . . . . . . . . v_mov_b32_e32 v22, s22 -# CHECK-NEXT: [0,28] . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v23, s23 -# CHECK-NEXT: [0,29] . . . . . . . DE. . . . . . . . . . . . . v_mov_b32_e32 v24, s24 -# CHECK-NEXT: [0,30] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v25, s25 -# CHECK-NEXT: [0,31] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v26, s26 -# CHECK-NEXT: [0,32] . . . . . . . .DE . . . . . . . . . . . . v_mov_b32_e32 v27, s27 -# CHECK-NEXT: [0,33] . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v28, s28 -# CHECK-NEXT: [0,34] . . . . . . . . DE. . . . . . . . . . . . v_mov_b32_e32 v29, s29 -# CHECK-NEXT: [0,35] . . . . . . . . . . . . . . . . . . . DE s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Average Wait times (based on the timeline view): -# CHECK-NEXT: [0]: Executions -# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue -# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready -# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage - -# CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 1 0.0 0.0 0.0 s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: 1. 1 0.0 0.0 0.0 s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: 2. 1 0.0 0.0 0.0 s_waitcnt lgkmcnt(0) -# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s2 -# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s3 -# CHECK-NEXT: 5. 1 0.0 0.0 0.0 flat_load_dword v2, v[0:1] -# CHECK-NEXT: 6. 1 0.0 0.0 0.0 flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: 7. 1 0.0 0.0 0.0 flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: 8. 1 0.0 0.0 0.0 flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: 9. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s0 -# CHECK-NEXT: 10. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s1 -# CHECK-NEXT: 11. 1 0.0 0.0 0.0 v_mov_b32_e32 v6, s6 -# CHECK-NEXT: 12. 1 0.0 0.0 0.0 v_mov_b32_e32 v7, s7 -# CHECK-NEXT: 13. 1 0.0 0.0 0.0 v_mov_b32_e32 v8, s8 -# CHECK-NEXT: 14. 1 0.0 0.0 0.0 v_mov_b32_e32 v9, s9 -# CHECK-NEXT: 15. 1 0.0 0.0 0.0 v_mov_b32_e32 v10, s10 -# CHECK-NEXT: 16. 1 0.0 0.0 0.0 v_mov_b32_e32 v11, s11 -# CHECK-NEXT: 17. 1 0.0 0.0 0.0 v_mov_b32_e32 v12, s12 -# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_mov_b32_e32 v13, s13 -# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_mov_b32_e32 v14, s14 -# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_mov_b32_e32 v15, s15 -# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_mov_b32_e32 v16, s16 -# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_mov_b32_e32 v17, s17 -# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_mov_b32_e32 v18, s18 -# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_mov_b32_e32 v19, s19 -# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_mov_b32_e32 v20, s20 -# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_mov_b32_e32 v21, s21 -# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_mov_b32_e32 v22, s22 -# CHECK-NEXT: 28. 1 0.0 0.0 0.0 v_mov_b32_e32 v23, s23 -# CHECK-NEXT: 29. 1 0.0 0.0 0.0 v_mov_b32_e32 v24, s24 -# CHECK-NEXT: 30. 1 0.0 0.0 0.0 v_mov_b32_e32 v25, s25 -# CHECK-NEXT: 31. 1 0.0 0.0 0.0 v_mov_b32_e32 v26, s26 -# CHECK-NEXT: 32. 1 0.0 0.0 0.0 v_mov_b32_e32 v27, s27 -# CHECK-NEXT: 33. 1 0.0 0.0 0.0 v_mov_b32_e32 v28, s28 -# CHECK-NEXT: 34. 1 0.0 0.0 0.0 v_mov_b32_e32 v29, s29 -# CHECK-NEXT: 35. 
1 0.0 0.0 0.0 s_waitcnt vmcnt(0) lgkmcnt(0) -# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp index 6ee77fa2b3845..a655f3faf1bf2 100644 --- a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp +++ b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp @@ -19,311 +19,15 @@ namespace llvm { namespace mca { -void AMDGPUInstrPostProcess::postProcessInstruction( - std::unique_ptr &Inst, const MCInst &MCI) { - switch (MCI.getOpcode()) { - case AMDGPU::S_WAITCNT: - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_VSCNT: - case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - case AMDGPU::S_WAITCNT_VSCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx6_gfx7: - case AMDGPU::S_WAITCNT_vi: - return processWaitCnt(Inst, MCI); - } -} - -// s_waitcnt instructions encode important information as immediate operands -// which are lost during the MCInst -> mca::Instruction lowering. -void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr &Inst, - const MCInst &MCI) { - for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) { - MCAOperand Op; - const MCOperand &MCOp = MCI.getOperand(Idx); - if (MCOp.isReg()) { - Op = MCAOperand::createReg(MCOp.getReg()); - } else if (MCOp.isImm()) { - Op = MCAOperand::createImm(MCOp.getImm()); - } - Op.setIndex(Idx); - Inst->addOperand(Op); - } -} - AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr, const MCInstrInfo &MCII) - : CustomBehaviour(STI, SrcMgr, MCII) { - generateWaitCntInfo(); -} + : CustomBehaviour(STI, SrcMgr, MCII) {} unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef IssuedInst, const InstRef &IR) { - const Instruction &Inst = *IR.getInstruction(); - unsigned Opcode = Inst.getOpcode(); - - // llvm-mca is generally run on fully compiled assembly so we wouldn't see any - // pseudo instructions here. However, there are plans for the future to make - // it possible to use mca within backend passes. As such, I have left the - // pseudo version of s_waitcnt within this switch statement. - switch (Opcode) { - default: - return 0; - case AMDGPU::S_WAITCNT: // This instruction - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo. - case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - case AMDGPU::S_WAITCNT_VSCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx6_gfx7: - case AMDGPU::S_WAITCNT_vi: - // s_endpgm also behaves as if there is an implicit - // s_waitcnt 0, but I'm not sure if it would be appropriate - // to model this in llvm-mca based on how the iterations work - // while simulating the pipeline over and over. - return handleWaitCnt(IssuedInst, IR); - } - return 0; } -unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef IssuedInst, - const InstRef &IR) { - // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr. - // I do not know how that instruction works so I did not attempt to model it. 
- // set the max values to begin - unsigned Vmcnt = 63; - unsigned Expcnt = 7; - unsigned Lgkmcnt = 31; - unsigned Vscnt = 63; - unsigned CurrVmcnt = 0; - unsigned CurrExpcnt = 0; - unsigned CurrLgkmcnt = 0; - unsigned CurrVscnt = 0; - unsigned CyclesToWaitVm = ~0U; - unsigned CyclesToWaitExp = ~0U; - unsigned CyclesToWaitLgkm = ~0U; - unsigned CyclesToWaitVs = ~0U; - - computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt); - - // We will now look at each of the currently executing instructions - // to find out if this wait instruction still needs to wait. - for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { - const InstRef &PrevIR = *I; - const Instruction &PrevInst = *PrevIR.getInstruction(); - const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); - const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; - const int CyclesLeft = PrevInst.getCyclesLeft(); - assert(CyclesLeft != UNKNOWN_CYCLES && - "We should know how many cycles are left for this instruction"); - if (PrevInstWaitInfo.VmCnt) { - CurrVmcnt++; - if ((unsigned)CyclesLeft < CyclesToWaitVm) - CyclesToWaitVm = CyclesLeft; - } - if (PrevInstWaitInfo.ExpCnt) { - CurrExpcnt++; - if ((unsigned)CyclesLeft < CyclesToWaitExp) - CyclesToWaitExp = CyclesLeft; - } - if (PrevInstWaitInfo.LgkmCnt) { - CurrLgkmcnt++; - if ((unsigned)CyclesLeft < CyclesToWaitLgkm) - CyclesToWaitLgkm = CyclesLeft; - } - if (PrevInstWaitInfo.VsCnt) { - CurrVscnt++; - if ((unsigned)CyclesLeft < CyclesToWaitVs) - CyclesToWaitVs = CyclesLeft; - } - } - - unsigned CyclesToWait = ~0U; - if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait) - CyclesToWait = CyclesToWaitVm; - if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait) - CyclesToWait = CyclesToWaitExp; - if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait) - CyclesToWait = CyclesToWaitLgkm; - if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait) - CyclesToWait = CyclesToWaitVs; - - // We may underestimate how many cycles we need to wait, but this - // isn't a big deal. Our return value is just how many cycles until - // this function gets run again. So as long as we don't overestimate - // the wait time, we'll still end up stalling at this instruction - // for the correct number of cycles. - - if (CyclesToWait == ~0U) - return 0; - return CyclesToWait; -} - -void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, - unsigned &Expcnt, unsigned &Lgkmcnt, - unsigned &Vscnt) { - AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); - const Instruction &Inst = *IR.getInstruction(); - unsigned Opcode = Inst.getOpcode(); - - switch (Opcode) { - case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - case AMDGPU::S_WAITCNT_VSCNT_gfx10: { - // Should probably be checking for nullptr - // here, but I'm not sure how I should handle the case - // where we see a nullptr. - const MCAOperand *OpReg = Inst.getOperand(0); - const MCAOperand *OpImm = Inst.getOperand(1); - assert(OpReg && OpReg->isReg() && "First operand should be a register."); - assert(OpImm && OpImm->isImm() && "Second operand should be an immediate."); - if (OpReg->getReg() != AMDGPU::SGPR_NULL) { - // Instruction is using a real register. - // Since we can't know what value this register will have, - // we can't compute what the value of this wait should be. - WithColor::warning() << "The register component of " - << MCII.getName(Opcode) << " will be completely " - << "ignored. 
So the wait may not be accurate.\n"; - } - switch (Opcode) { - // Redundant switch so I don't have to repeat the code above - // for each case. There are more clever ways to avoid this - // extra switch and anyone can feel free to implement one of them. - case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - Expcnt = OpImm->getImm(); - break; - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - Lgkmcnt = OpImm->getImm(); - break; - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - Vmcnt = OpImm->getImm(); - break; - case AMDGPU::S_WAITCNT_VSCNT_gfx10: - Vscnt = OpImm->getImm(); - break; - } - return; - } - case AMDGPU::S_WAITCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx6_gfx7: - case AMDGPU::S_WAITCNT_vi: - unsigned WaitCnt = Inst.getOperand(0)->getImm(); - AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt); - return; - } -} - -void AMDGPUCustomBehaviour::generateWaitCntInfo() { - // The core logic from this function is taken from - // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions - // that are being looked at are in the MachineInstr format, whereas we have - // access to the MCInst format. The side effects of this are that we can't use - // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst) - // functions. Therefore, we conservatively assume that these functions will - // return true. This may cause a few instructions to be incorrectly tagged - // with an extra CNT. However, these are instructions that do interact with at - // least one CNT so giving them an extra CNT shouldn't cause issues in most - // scenarios. - AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); - InstrWaitCntInfo.resize(SrcMgr.size()); - - int Index = 0; - for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) { - const std::unique_ptr &Inst = *I; - unsigned Opcode = Inst->getOpcode(); - const MCInstrDesc &MCID = MCII.get(Opcode); - if ((MCID.TSFlags & SIInstrFlags::DS) && - (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) { - InstrWaitCntInfo[Index].LgkmCnt = true; - if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds)) - InstrWaitCntInfo[Index].ExpCnt = true; - } else if (MCID.TSFlags & SIInstrFlags::FLAT) { - // We conservatively assume that mayAccessVMEMThroughFlat(Inst) - // and mayAccessLDSThroughFlat(Inst) would both return true for this - // instruction. We have to do this because those functions use - // information about the memory operands that we don't have access to. 
- InstrWaitCntInfo[Index].LgkmCnt = true; - if (!STI.hasFeature(AMDGPU::FeatureVscnt)) - InstrWaitCntInfo[Index].VmCnt = true; - else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) - InstrWaitCntInfo[Index].VmCnt = true; - else - InstrWaitCntInfo[Index].VsCnt = true; - } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) { - if (!STI.hasFeature(AMDGPU::FeatureVscnt)) - InstrWaitCntInfo[Index].VmCnt = true; - else if ((MCID.mayLoad() && - !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) || - ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() && - !MCID.mayStore())) - InstrWaitCntInfo[Index].VmCnt = true; - else if (MCID.mayStore()) - InstrWaitCntInfo[Index].VsCnt = true; - - // (IV.Major < 7) is meant to represent - // GCNTarget.vmemWriteNeedsExpWaitcnt() - // which is defined as - // { return getGeneration() < SEA_ISLANDS; } - if (IV.Major < 7 && - (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet))) - InstrWaitCntInfo[Index].ExpCnt = true; - } else if (MCID.TSFlags & SIInstrFlags::SMRD) { - InstrWaitCntInfo[Index].LgkmCnt = true; - } else if (MCID.TSFlags & SIInstrFlags::EXP) { - InstrWaitCntInfo[Index].ExpCnt = true; - } else { - switch (Opcode) { - case AMDGPU::S_SENDMSG: - case AMDGPU::S_SENDMSGHALT: - case AMDGPU::S_MEMTIME: - case AMDGPU::S_MEMREALTIME: - InstrWaitCntInfo[Index].LgkmCnt = true; - break; - } - } - } -} - -// taken from SIInstrInfo::isVMEM() -bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) { - return MCID.TSFlags & SIInstrFlags::MUBUF || - MCID.TSFlags & SIInstrFlags::MTBUF || - MCID.TSFlags & SIInstrFlags::MIMG; -} - -// taken from SIInstrInfo::hasModifiersSet() -bool AMDGPUCustomBehaviour::hasModifiersSet( - const std::unique_ptr &Inst, unsigned OpName) const { - int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName); - if (Idx == -1) - return false; - - const MCAOperand *Op = Inst->getOperand(Idx); - if (Op == nullptr || !Op->isImm() || !Op->getImm()) - return false; - - return true; -} - -// taken from SIInstrInfo::isAlwaysGDS() -bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const { - return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT || - Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR || - Opcode == AMDGPU::DS_GWS_SEMA_P || - Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || - Opcode == AMDGPU::DS_GWS_BARRIER; -} - } // namespace mca } // namespace llvm diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h index e1efafa427fd5..0dd21c7b4c446 100644 --- a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h +++ b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h @@ -23,8 +23,6 @@ namespace llvm { namespace mca { class AMDGPUInstrPostProcess : public InstrPostProcess { - void processWaitCnt(std::unique_ptr &Inst, const MCInst &MCI); - public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} @@ -32,54 +30,10 @@ class AMDGPUInstrPostProcess : public InstrPostProcess { ~AMDGPUInstrPostProcess() {} void postProcessInstruction(std::unique_ptr &Inst, - const MCInst &MCI) override; -}; - -struct WaitCntInfo { - bool VmCnt = false; - bool ExpCnt = false; - bool LgkmCnt = false; - bool VsCnt = false; + const MCInst &MCI) override {} }; class AMDGPUCustomBehaviour : public CustomBehaviour { - /// Whenever MCA would like to dispatch an s_waitcnt instructions, - /// we must check all the instruction that are still executing 
to see if - /// they modify the same CNT as we need to wait for. This vector - /// gets built in the constructor and contains 1 WaitCntInfo struct - /// for each instruction within the SrcManager. Each element - /// tells us which CNTs that instruction may interact with. - /// We conservatively assume some instructions interact with more - /// CNTs than they do in reality, so we will occasionally wait - /// longer than necessary, but we shouldn't ever wait for shorter. - std::vector InstrWaitCntInfo; - - /// This method gets called from the constructor and is - /// where we setup the InstrWaitCntInfo vector. - /// The core logic for determining which CNTs an instruction - /// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter(). - /// Unfortunately, some of the logic from that function is not avalable to us - /// in this scope so we conservatively end up assuming that some - /// instructions interact with more CNTs than they do in reality. - void generateWaitCntInfo(); - /// Helper function used in generateWaitCntInfo() - bool hasModifiersSet(const std::unique_ptr &Inst, - unsigned OpName) const; - /// Helper function used in generateWaitCntInfo() - bool isAlwaysGDS(uint16_t Opcode) const; - /// Helper function used in generateWaitCntInfo() - bool isVMEM(const MCInstrDesc &MCID); - /// This method gets called from checkCustomHazard when mca is attempting to - /// dispatch an s_waitcnt instruction (or one of its variants). The method - /// looks at each of the instructions that are still executing in the pipeline - /// to determine if the waitcnt should force a wait. - unsigned handleWaitCnt(ArrayRef IssuedInst, const InstRef &IR); - /// Based on the type of s_waitcnt instruction we are looking at, and what its - /// operands are, this method will set the values for each of the cnt - /// references provided as arguments. - void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt, - unsigned &Lgkmcnt, unsigned &Vscnt); - public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr, const MCInstrInfo &MCII); From 88efb59b7829a97b3ea7d847bd84e8905a7dee42 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 7 Jul 2021 21:16:06 +1000 Subject: [PATCH 15/32] [ORC] Fix file comments. --- llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h index 379dd9efefd15..8dffea70e3355 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h @@ -1,4 +1,4 @@ -//===------ OrcError.h - Reject symbol lookup requests ------*- C++ -*-===// +//===--------------- OrcError.h - Orc Error Types ---------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Define an error category, error codes, and helper utilities for Orc. +// Define an error category, error codes, and helper utilities for Orc. 
// //===----------------------------------------------------------------------===// From 5471766f9d16fbc5a82dd9503729747d901242a1 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 8 Jul 2021 14:10:15 +1000 Subject: [PATCH 16/32] [ORC] Replace MachOJITDylibInitializers::SectionExtent with ExecutorAddressRange MachOJITDylibInitializers::SectionExtent represented the address range of a section as an (address, size) pair. The new ExecutorAddressRange type generalizes this to an address range (for any object, not necessarily a section) represented as a (start-address, end-address) pair. The aim is to express more of ORC (and the ORC runtime) in terms of simple types that can be serialized/deserialized via SPS. This will simplify SPS-based RPC involving arguments/return-values of these types. --- .../llvm/ExecutionEngine/Orc/MachOPlatform.h | 23 +++--- .../Orc/Shared/CommonOrcRuntimeTypes.h | 66 +++++++++++++++++ .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 71 +++++++++++-------- 3 files changed, 115 insertions(+), 45 deletions(-) create mode 100644 llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index f04bef161ea71..f9d0b587a1bed 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -17,6 +17,7 @@ #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h" #include #include @@ -31,21 +32,13 @@ bool objCRegistrationEnabled(); class MachOJITDylibInitializers { public: - struct SectionExtent { - SectionExtent() = default; - SectionExtent(JITTargetAddress Address, uint64_t NumPtrs) - : Address(Address), NumPtrs(NumPtrs) {} - JITTargetAddress Address = 0; - uint64_t NumPtrs = 0; - }; - - using RawPointerSectionList = std::vector; + using RawPointerSectionList = std::vector; void setObjCImageInfoAddr(JITTargetAddress ObjCImageInfoAddr) { this->ObjCImageInfoAddr = ObjCImageInfoAddr; } - void addModInitsSection(SectionExtent ModInit) { + void addModInitsSection(shared::ExecutorAddressRange ModInit) { ModInitSections.push_back(std::move(ModInit)); } @@ -53,7 +46,7 @@ class MachOJITDylibInitializers { return ModInitSections; } - void addObjCSelRefsSection(SectionExtent ObjCSelRefs) { + void addObjCSelRefsSection(shared::ExecutorAddressRange ObjCSelRefs) { ObjCSelRefsSections.push_back(std::move(ObjCSelRefs)); } @@ -61,7 +54,7 @@ class MachOJITDylibInitializers { return ObjCSelRefsSections; } - void addObjCClassListSection(SectionExtent ObjCClassList) { + void addObjCClassListSection(shared::ExecutorAddressRange ObjCClassList) { ObjCClassListSections.push_back(std::move(ObjCClassList)); } @@ -152,9 +145,9 @@ class MachOPlatform : public Platform { }; void registerInitInfo(JITDylib &JD, JITTargetAddress ObjCImageInfoAddr, - MachOJITDylibInitializers::SectionExtent ModInits, - MachOJITDylibInitializers::SectionExtent ObjCSelRefs, - MachOJITDylibInitializers::SectionExtent ObjCClassList); + shared::ExecutorAddressRange ModInits, + shared::ExecutorAddressRange ObjCSelRefs, + shared::ExecutorAddressRange ObjCClassList); ExecutionSession &ES; ObjectLinkingLayer &ObjLinkingLayer; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h 
new file mode 100644 index 0000000000000..8b0e6272a555b --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h @@ -0,0 +1,66 @@ +//===------------------- CommonOrcRuntimeTypes.h ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic types usable with SPS and the ORC runtime. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_COMMONORCRUNTIMETYPES_H +#define LLVM_EXECUTIONENGINE_ORC_SHARED_COMMONORCRUNTIMETYPES_H + +#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" + +namespace llvm { +namespace orc { +namespace shared { + +/// Represents an address range in the exceutor process. +struct ExecutorAddressRange { + ExecutorAddressRange() = default; + ExecutorAddressRange(JITTargetAddress StartAddress, + JITTargetAddress EndAddress) + : StartAddress(StartAddress), EndAddress(EndAddress) {} + + bool empty() const { return StartAddress == EndAddress; } + size_t size() const { return EndAddress - StartAddress; } + + JITTargetAddress StartAddress = 0; + JITTargetAddress EndAddress = 0; +}; + +using SPSExecutorAddressRange = + SPSTuple; + +/// Serialization traits for address ranges. +template <> +class SPSSerializationTraits { +public: + static size_t size(const ExecutorAddressRange &Value) { + return SPSArgList::size( + Value.StartAddress, Value.EndAddress); + } + + static bool serialize(SPSOutputBuffer &BOB, + const ExecutorAddressRange &Value) { + return SPSArgList::serialize( + BOB, Value.StartAddress, Value.EndAddress); + } + + static bool deserialize(SPSInputBuffer &BIB, ExecutorAddressRange &Value) { + return SPSArgList::deserialize( + BIB, Value.StartAddress, Value.EndAddress); + } +}; + +using SPSExecutorAddressRangeSequence = SPSSequence; + +} // End namespace shared. +} // End namespace orc. +} // End namespace llvm. 
+ +#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_COMMONORCRUNTIMETYPES_H diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 80df097a07410..74c88b0c1c85b 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -88,11 +88,15 @@ bool objCRegistrationEnabled() { void MachOJITDylibInitializers::runModInits() const { for (const auto &ModInit : ModInitSections) { - for (uint64_t I = 0; I != ModInit.NumPtrs; ++I) { - auto *InitializerAddr = jitTargetAddressToPointer( - ModInit.Address + (I * sizeof(uintptr_t))); - auto *Initializer = - jitTargetAddressToFunction(*InitializerAddr); + assert(ModInit.size() % sizeof(uintptr_t) == 0 && + "ModInit section size is not a pointer multiple?"); + for (uintptr_t * + InitPtr = + jitTargetAddressToPointer(ModInit.StartAddress), + *InitEnd = + jitTargetAddressToPointer(ModInit.EndAddress); + InitPtr != InitEnd; ++InitPtr) { + auto *Initializer = reinterpret_cast(*InitPtr); Initializer(); } } @@ -102,8 +106,11 @@ void MachOJITDylibInitializers::registerObjCSelectors() const { assert(objCRegistrationEnabled() && "ObjC registration not enabled."); for (const auto &ObjCSelRefs : ObjCSelRefsSections) { - for (uint64_t I = 0; I != ObjCSelRefs.NumPtrs; ++I) { - auto SelEntryAddr = ObjCSelRefs.Address + (I * sizeof(uintptr_t)); + assert(ObjCSelRefs.size() % sizeof(uintptr_t) == 0 && + "ObjCSelRefs section size is not a pointer multiple?"); + for (JITTargetAddress SelEntryAddr = ObjCSelRefs.StartAddress; + SelEntryAddr != ObjCSelRefs.EndAddress; + SelEntryAddr += sizeof(uintptr_t)) { const auto *SelName = *jitTargetAddressToPointer(SelEntryAddr); auto Sel = sel_registerName(SelName); @@ -128,8 +135,11 @@ Error MachOJITDylibInitializers::registerObjCClasses() const { auto ClassSelector = sel_registerName("class"); for (const auto &ObjCClassList : ObjCClassListSections) { - for (uint64_t I = 0; I != ObjCClassList.NumPtrs; ++I) { - auto ClassPtrAddr = ObjCClassList.Address + (I * sizeof(uintptr_t)); + assert(ObjCClassList.size() % sizeof(uintptr_t) == 0 && + "ObjCClassList section size is not a pointer multiple?"); + for (JITTargetAddress ClassPtrAddr = ObjCClassList.StartAddress; + ClassPtrAddr != ObjCClassList.EndAddress; + ClassPtrAddr += sizeof(uintptr_t)) { auto Cls = *jitTargetAddressToPointer(ClassPtrAddr); auto *ClassCompiled = *jitTargetAddressToPointer(ClassPtrAddr); @@ -264,37 +274,36 @@ MachOPlatform::getDeinitializerSequence(JITDylib &JD) { void MachOPlatform::registerInitInfo( JITDylib &JD, JITTargetAddress ObjCImageInfoAddr, - MachOJITDylibInitializers::SectionExtent ModInits, - MachOJITDylibInitializers::SectionExtent ObjCSelRefs, - MachOJITDylibInitializers::SectionExtent ObjCClassList) { + shared::ExecutorAddressRange ModInits, + shared::ExecutorAddressRange ObjCSelRefs, + shared::ExecutorAddressRange ObjCClassList) { std::lock_guard Lock(InitSeqsMutex); auto &InitSeq = InitSeqs[&JD]; InitSeq.setObjCImageInfoAddr(ObjCImageInfoAddr); - if (ModInits.Address) + if (ModInits.StartAddress) InitSeq.addModInitsSection(std::move(ModInits)); - if (ObjCSelRefs.Address) + if (ObjCSelRefs.StartAddress) InitSeq.addObjCSelRefsSection(std::move(ObjCSelRefs)); - if (ObjCClassList.Address) + if (ObjCClassList.StartAddress) InitSeq.addObjCClassListSection(std::move(ObjCClassList)); } -static Expected +static Expected getSectionExtent(jitlink::LinkGraph &G, StringRef SectionName) { auto *Sec = G.findSectionByName(SectionName); if (!Sec) - return 
MachOJITDylibInitializers::SectionExtent(); + return shared::ExecutorAddressRange(); jitlink::SectionRange R(*Sec); if (R.getSize() % G.getPointerSize() != 0) return make_error(SectionName + " section size is not a " "multiple of the pointer size", inconvertibleErrorCode()); - return MachOJITDylibInitializers::SectionExtent( - R.getStart(), R.getSize() / G.getPointerSize()); + return shared::ExecutorAddressRange{R.getStart(), R.getEnd()}; } void MachOPlatform::InitScraperPlugin::modifyPassConfig( @@ -326,8 +335,7 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( Config.PostFixupPasses.push_back([this, &JD = MR.getTargetJITDylib()]( jitlink::LinkGraph &G) -> Error { - MachOJITDylibInitializers::SectionExtent ModInits, ObjCSelRefs, - ObjCClassList; + shared::ExecutorAddressRange ModInits, ObjCSelRefs, ObjCClassList; JITTargetAddress ObjCImageInfoAddr = 0; if (auto *ObjCImageInfoSec = @@ -359,23 +367,26 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( LLVM_DEBUG({ dbgs() << "MachOPlatform: Scraped " << G.getName() << " init sections:\n"; dbgs() << " __objc_selrefs: "; - if (ObjCSelRefs.NumPtrs) - dbgs() << ObjCSelRefs.NumPtrs << " pointer(s) at " - << formatv("{0:x16}", ObjCSelRefs.Address) << "\n"; + auto NumObjCSelRefs = ObjCSelRefs.size() / sizeof(uintptr_t); + if (NumObjCSelRefs) + dbgs() << NumObjCSelRefs << " pointer(s) at " + << formatv("{0:x16}", ObjCSelRefs.StartAddress) << "\n"; else dbgs() << "none\n"; dbgs() << " __objc_classlist: "; - if (ObjCClassList.NumPtrs) - dbgs() << ObjCClassList.NumPtrs << " pointer(s) at " - << formatv("{0:x16}", ObjCClassList.Address) << "\n"; + auto NumObjCClasses = ObjCClassList.size() / sizeof(uintptr_t); + if (NumObjCClasses) + dbgs() << NumObjCClasses << " pointer(s) at " + << formatv("{0:x16}", ObjCClassList.StartAddress) << "\n"; else dbgs() << "none\n"; dbgs() << " __mod_init_func: "; - if (ModInits.NumPtrs) - dbgs() << ModInits.NumPtrs << " pointer(s) at " - << formatv("{0:x16}", ModInits.Address) << "\n"; + auto NumModInits = ModInits.size() / sizeof(uintptr_t); + if (NumModInits) + dbgs() << NumModInits << " pointer(s) at " + << formatv("{0:x16}", ModInits.StartAddress) << "\n"; else dbgs() << "none\n"; }); From f8c5a4c67075877e1b6976bb7372aa96f02c11bc Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 7 Jul 2021 23:14:31 -0700 Subject: [PATCH 17/32] [WebAssembly] Optimize out shift masks WebAssembly's shift instructions implicitly masks the shift count, so optimize out redundant explicit masks of the shift count. For vector shifts, this currently only works if the mask is applied before splatting the shift count, but this should be addressed in a future commit. Resolves PR49655. Differential Revision: https://reviews.llvm.org/D105600 --- .../WebAssembly/WebAssemblyInstrInteger.td | 8 + .../WebAssembly/WebAssemblyInstrSIMD.td | 29 + .../test/CodeGen/WebAssembly/masked-shifts.ll | 531 ++++++++++++++++++ 3 files changed, 568 insertions(+) create mode 100644 llvm/test/CodeGen/WebAssembly/masked-shifts.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 18250cf8ef850..7a0c524d63b0d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -93,6 +93,14 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (setcc I64:$src, 0, SETEQ))], "i64.eqz \t$dst, $src", "i64.eqz", 0x50>; +// Optimize away an explicit mask on a shift count. 
+def : Pat<(shl I32:$lhs, (and I32:$rhs, 31)), (SHL_I32 I32:$lhs, I32:$rhs)>; +def : Pat<(sra I32:$lhs, (and I32:$rhs, 31)), (SHR_S_I32 I32:$lhs, I32:$rhs)>; +def : Pat<(srl I32:$lhs, (and I32:$rhs, 31)), (SHR_U_I32 I32:$lhs, I32:$rhs)>; +def : Pat<(shl I64:$lhs, (and I64:$rhs, 63)), (SHL_I64 I64:$lhs, I64:$rhs)>; +def : Pat<(sra I64:$lhs, (and I64:$rhs, 63)), (SHR_S_I64 I64:$lhs, I64:$rhs)>; +def : Pat<(srl I64:$lhs, (and I64:$rhs, 63)), (SHR_U_I64 I64:$lhs, I64:$rhs)>; + // Optimize away an explicit mask on a rotate count. def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 2c35b4944fc47..d7058ff049362 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -898,6 +898,35 @@ defm SHL : SIMDShiftInt; defm SHR_S : SIMDShiftInt; defm SHR_U : SIMDShiftInt; +// Optimize away an explicit mask on a shift count. +def : Pat<(wasm_shl (v16i8 V128:$lhs), (and I32:$rhs, 7)), + (SHL_I8x16 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_s (v16i8 V128:$lhs), (and I32:$rhs, 7)), + (SHR_S_I8x16 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_u (v16i8 V128:$lhs), (and I32:$rhs, 7)), + (SHR_U_I8x16 V128:$lhs, I32:$rhs)>; + +def : Pat<(wasm_shl (v8i16 V128:$lhs), (and I32:$rhs, 15)), + (SHL_I16x8 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_s (v8i16 V128:$lhs), (and I32:$rhs, 15)), + (SHR_S_I16x8 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_u (v8i16 V128:$lhs), (and I32:$rhs, 15)), + (SHR_U_I16x8 V128:$lhs, I32:$rhs)>; + +def : Pat<(wasm_shl (v4i32 V128:$lhs), (and I32:$rhs, 31)), + (SHL_I32x4 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_s (v4i32 V128:$lhs), (and I32:$rhs, 31)), + (SHR_S_I32x4 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_u (v4i32 V128:$lhs), (and I32:$rhs, 31)), + (SHR_U_I32x4 V128:$lhs, I32:$rhs)>; + +def : Pat<(wasm_shl (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))), + (SHL_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>; +def : Pat<(wasm_shr_s (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))), + (SHR_S_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>; +def : Pat<(wasm_shr_u (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))), + (SHR_U_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>; + //===----------------------------------------------------------------------===// // Integer binary arithmetic //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll new file mode 100644 index 0000000000000..75db5e190bd22 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll @@ -0,0 +1,531 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s + +;; Check that masked shift counts are optimized out. + +;; TODO: optimize the *_late functions. 
+ +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +define i32 @shl_i32(i32 %v, i32 %x) { +; CHECK-LABEL: shl_i32: +; CHECK: .functype shl_i32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.shl +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %a = shl i32 %v, %m + ret i32 %a +} + +define i32 @sra_i32(i32 %v, i32 %x) { +; CHECK-LABEL: sra_i32: +; CHECK: .functype sra_i32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %a = ashr i32 %v, %m + ret i32 %a +} + +define i32 @srl_i32(i32 %v, i32 %x) { +; CHECK-LABEL: srl_i32: +; CHECK: .functype srl_i32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %a = lshr i32 %v, %m + ret i32 %a +} + +define i64 @shl_i64(i64 %v, i64 %x) { +; CHECK-LABEL: shl_i64: +; CHECK: .functype shl_i64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.shl +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %a = shl i64 %v, %m + ret i64 %a +} + +define i64 @sra_i64(i64 %v, i64 %x) { +; CHECK-LABEL: sra_i64: +; CHECK: .functype sra_i64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %a = ashr i64 %v, %m + ret i64 %a +} + +define i64 @srl_i64(i64 %v, i64 %x) { +; CHECK-LABEL: srl_i64: +; CHECK: .functype srl_i64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %a = lshr i64 %v, %m + ret i64 %a +} + +define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: shl_v16i8: +; CHECK: .functype shl_v16i8 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shl +; CHECK-NEXT: # fallthrough-return + %m = and i8 %x, 7 + %t = insertelement <16 x i8> undef, i8 %m, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %a = shl <16 x i8> %v, %s + ret <16 x i8> %a +} + +define <16 x i8> @shl_v16i8_late(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: shl_v16i8_late: +; CHECK: .functype shl_v16i8_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.splat +; CHECK-NEXT: v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: i8x16.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <16 x i8> undef, i8 %x, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %m = and <16 x i8> %s, + %a = shl <16 x i8> %v, %m + ret <16 x i8> %a +} + +define <16 x i8> @ashr_v16i8(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: ashr_v16i8: +; CHECK: .functype ashr_v16i8 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i8 %x, 7 + %t = insertelement <16 x i8> undef, i8 %m, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %a = ashr <16 x i8> %v, %s + ret <16 x i8> %a +} + +define <16 
x i8> @ashr_v16i8_late(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: ashr_v16i8_late: +; CHECK: .functype ashr_v16i8_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.splat +; CHECK-NEXT: v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: i8x16.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <16 x i8> undef, i8 %x, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %m = and <16 x i8> %s, + %a = ashr <16 x i8> %v, %m + ret <16 x i8> %a +} + +define <16 x i8> @lshr_v16i8(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: lshr_v16i8: +; CHECK: .functype lshr_v16i8 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i8 %x, 7 + %t = insertelement <16 x i8> undef, i8 %m, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %a = lshr <16 x i8> %v, %s + ret <16 x i8> %a +} + +define <16 x i8> @lshr_v16i8_late(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: lshr_v16i8_late: +; CHECK: .functype lshr_v16i8_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.splat +; CHECK-NEXT: v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: i8x16.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <16 x i8> undef, i8 %x, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %m = and <16 x i8> %s, + %a = lshr <16 x i8> %v, %m + ret <16 x i8> %a +} + +define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: shl_v8i16: +; CHECK: .functype shl_v8i16 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.shl +; CHECK-NEXT: # fallthrough-return + %m = and i16 %x, 15 + %t = insertelement <8 x i16> undef, i16 %m, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %a = shl <8 x i16> %v, %s + ret <8 x i16> %a +} + +define <8 x i16> @shl_v8i16_late(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: shl_v8i16_late: +; CHECK: .functype shl_v8i16_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.splat +; CHECK-NEXT: v128.const 15, 15, 15, 15, 15, 15, 15, 15 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.extract_lane_u 0 +; CHECK-NEXT: i16x8.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <8 x i16> undef, i16 %x, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %m = and <8 x i16> %s, + + %a = shl <8 x i16> %v, %m + ret <8 x i16> %a +} + +define <8 x i16> @ashr_v8i16(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: ashr_v8i16: +; CHECK: .functype ashr_v8i16 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i16 %x, 15 + %t = insertelement <8 x i16> undef, i16 %m, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %a = ashr <8 x i16> %v, %s + ret <8 x i16> %a +} + +define <8 x i16> @ashr_v8i16_late(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: ashr_v8i16_late: +; CHECK: .functype ashr_v8i16_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.splat 
+; CHECK-NEXT: v128.const 15, 15, 15, 15, 15, 15, 15, 15 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.extract_lane_u 0 +; CHECK-NEXT: i16x8.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <8 x i16> undef, i16 %x, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %m = and <8 x i16> %s, + + %a = ashr <8 x i16> %v, %m + ret <8 x i16> %a +} + +define <8 x i16> @lshr_v8i16(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: lshr_v8i16: +; CHECK: .functype lshr_v8i16 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i16 %x, 15 + %t = insertelement <8 x i16> undef, i16 %m, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %a = lshr <8 x i16> %v, %s + ret <8 x i16> %a +} + +define <8 x i16> @lshr_v8i16_late(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: lshr_v8i16_late: +; CHECK: .functype lshr_v8i16_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.splat +; CHECK-NEXT: v128.const 15, 15, 15, 15, 15, 15, 15, 15 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.extract_lane_u 0 +; CHECK-NEXT: i16x8.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <8 x i16> undef, i16 %x, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %m = and <8 x i16> %s, + + %a = lshr <8 x i16> %v, %m + ret <8 x i16> %a +} + +define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: shl_v4i32: +; CHECK: .functype shl_v4i32 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %t = insertelement <4 x i32> undef, i32 %m, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %a = shl <4 x i32> %v, %s + ret <4 x i32> %a +} + +define <4 x i32> @shl_v4i32_late(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: shl_v4i32_late: +; CHECK: .functype shl_v4i32_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: v128.const 31, 31, 31, 31 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i32x4.extract_lane 0 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <4 x i32> undef, i32 %x, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %m = and <4 x i32> %s, + %a = shl <4 x i32> %v, %m + ret <4 x i32> %a +} + +define <4 x i32> @ashr_v4i32(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: ashr_v4i32: +; CHECK: .functype ashr_v4i32 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %t = insertelement <4 x i32> undef, i32 %m, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %a = ashr <4 x i32> %v, %s + ret <4 x i32> %a +} + +define <4 x i32> @ashr_v4i32_late(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: ashr_v4i32_late: +; CHECK: .functype ashr_v4i32_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: v128.const 31, 31, 31, 31 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i32x4.extract_lane 0 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <4 x i32> undef, i32 %x, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %m = 
and <4 x i32> %s, + %a = ashr <4 x i32> %v, %m + ret <4 x i32> %a +} + +define <4 x i32> @lshr_v4i32(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: lshr_v4i32: +; CHECK: .functype lshr_v4i32 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %t = insertelement <4 x i32> undef, i32 %m, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %a = lshr <4 x i32> %v, %s + ret <4 x i32> %a +} + +define <4 x i32> @lshr_v4i32_late(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: lshr_v4i32_late: +; CHECK: .functype lshr_v4i32_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: v128.const 31, 31, 31, 31 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i32x4.extract_lane 0 +; CHECK-NEXT: i32x4.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <4 x i32> undef, i32 %x, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %m = and <4 x i32> %s, + %a = lshr <4 x i32> %v, %m + ret <4 x i32> %a +} + +define <2 x i64> @shl_v2i64(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: shl_v2i64: +; CHECK: .functype shl_v2i64 (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shl +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %t = insertelement <2 x i64> undef, i64 %m, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %a = shl <2 x i64> %v, %s + ret <2 x i64> %a +} + +define <2 x i64> @shl_v2i64_late(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: shl_v2i64_late: +; CHECK: .functype shl_v2i64_late (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64x2.splat +; CHECK-NEXT: v128.const 63, 63 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <2 x i64> undef, i64 %x, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %m = and <2 x i64> %s, + %a = shl <2 x i64> %v, %m + ret <2 x i64> %a +} + +define <2 x i64> @ashr_v2i64(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: ashr_v2i64: +; CHECK: .functype ashr_v2i64 (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %t = insertelement <2 x i64> undef, i64 %m, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %a = ashr <2 x i64> %v, %s + ret <2 x i64> %a +} + +define <2 x i64> @ashr_v2i64_late(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: ashr_v2i64_late: +; CHECK: .functype ashr_v2i64_late (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64x2.splat +; CHECK-NEXT: v128.const 63, 63 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <2 x i64> undef, i64 %x, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %m = and <2 x i64> %s, + %a = ashr <2 x i64> %v, %m + ret <2 x i64> %a +} + +define <2 x i64> @lshr_v2i64(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: lshr_v2i64: +; CHECK: .functype lshr_v2i64 (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; 
CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %t = insertelement <2 x i64> undef, i64 %m, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %a = lshr <2 x i64> %v, %s + ret <2 x i64> %a +} + +define <2 x i64> @lshr_v2i64_late(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: lshr_v2i64_late: +; CHECK: .functype lshr_v2i64_late (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64x2.splat +; CHECK-NEXT: v128.const 63, 63 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <2 x i64> undef, i64 %x, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %m = and <2 x i64> %s, + %a = lshr <2 x i64> %v, %m + ret <2 x i64> %a +} From 0fd5e7b2d8ca4ed46d76187feb4b903ed0a3ea75 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 7 Jul 2021 23:31:48 -0700 Subject: [PATCH 18/32] [WebAssembly][lld] Fix segfault on .bss sections in mapfile When memory is declared in the Wasm module, we rely on the implicit zero initialization behavior and do not explicitly output .bss sections. The means that they do not have associated `outputSec` entries, which was causing segfaults in the mapfile support. Fix the issue by guarding against null `outputSec` and falling back to using a zero offset. Differential Revision: https://reviews.llvm.org/D102951 --- lld/test/wasm/map-file.s | 38 +++++++++++++++++++++++++------------- lld/wasm/MapFile.cpp | 12 ++++++++---- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/lld/test/wasm/map-file.s b/lld/test/wasm/map-file.s index a215cc14d4cc0..9ca2f196f331e 100644 --- a/lld/test/wasm/map-file.s +++ b/lld/test/wasm/map-file.s @@ -10,6 +10,9 @@ wasm_global: bar: .functype bar () -> () i32.const somedata + i32.const somezeroes + drop + drop end_function write_global: @@ -30,9 +33,15 @@ somedata: .int32 123 .size somedata, 4 +.section .bss.somezeroes,"",@ +somezeroes: + .int32 0 +.size somezeroes, 4 + .section .debug_info,"",@ .int32 bar + # CHECK: Addr Off Size Out In Symbol # CHECK-NEXT: - 8 a TYPE # CHECK-NEXT: - 12 6 FUNCTION @@ -42,19 +51,22 @@ somedata: # CHECK-NEXT: 0 0 0 __stack_pointer # CHECK-NEXT: 1 0 0 wasm_global # CHECK-NEXT: - 33 15 EXPORT -# CHECK-NEXT: - 48 26 CODE -# CHECK-NEXT: - 49 9 {{.*}}{{/|\\}}map-file.s.tmp1.o:(bar) -# CHECK-NEXT: - 49 9 bar -# CHECK-NEXT: - 52 b {{.*}}{{/|\\}}map-file.s.tmp1.o:(write_global) -# CHECK-NEXT: - 52 b write_global -# CHECK-NEXT: - 5d f {{.*}}{{/|\\}}map-file.s.tmp1.o:(_start) -# CHECK-NEXT: - 5d f _start -# CHECK-NEXT: - 6e d DATA -# CHECK-NEXT: 400 6f 4 .data -# CHECK-NEXT: 400 75 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) -# CHECK-NEXT: 400 75 4 somedata -# CHECK-NEXT: - 7b 12 CUSTOM(.debug_info) -# CHECK-NEXT: - 8d 50 CUSTOM(name) +# CHECK-NEXT: - 48 2e CODE +# CHECK-NEXT: - 49 11 {{.*}}{{/|\\}}map-file.s.tmp1.o:(bar) +# CHECK-NEXT: - 49 11 bar +# CHECK-NEXT: - 5a b {{.*}}{{/|\\}}map-file.s.tmp1.o:(write_global) +# CHECK-NEXT: - 5a b write_global +# CHECK-NEXT: - 65 f {{.*}}{{/|\\}}map-file.s.tmp1.o:(_start) +# CHECK-NEXT: - 65 f _start +# CHECK-NEXT: - 76 d DATA +# CHECK-NEXT: 400 77 4 .data +# CHECK-NEXT: 400 7d 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) +# CHECK-NEXT: 400 7d 4 somedata +# CHECK-NEXT: 404 76 4 .bss +# CHECK-NEXT: 404 0 4 
{{.*}}{{/|\\}}map-file.s.tmp1.o:(.bss.somezeroes) +# CHECK-NEXT: 404 0 4 somezeroes +# CHECK-NEXT: - 83 12 CUSTOM(.debug_info) +# CHECK-NEXT: - 95 50 CUSTOM(name) # RUN: not wasm-ld %t1.o -o /dev/null -Map=/ 2>&1 \ # RUN: | FileCheck -check-prefix=FAIL %s diff --git a/lld/wasm/MapFile.cpp b/lld/wasm/MapFile.cpp index c964efe1e742f..9dbab5046e23a 100644 --- a/lld/wasm/MapFile.cpp +++ b/lld/wasm/MapFile.cpp @@ -80,7 +80,9 @@ getSymbolStrings(ArrayRef syms) { auto *chunk = syms[i]->getChunk(); if (chunk == nullptr) return; - uint64_t fileOffset = chunk->outputSec->getOffset() + chunk->outSecOff; + uint64_t fileOffset = chunk->outputSec != nullptr + ? chunk->outputSec->getOffset() + chunk->outSecOff + : 0; uint64_t vma = -1; uint64_t size = 0; if (auto *DD = dyn_cast(syms[i])) { @@ -138,9 +140,11 @@ void lld::wasm::writeMapFile(ArrayRef outputSections) { oseg->size); os << oseg->name << '\n'; for (auto *chunk : oseg->inputSegments) { - writeHeader(os, chunk->getVA(), - chunk->outputSec->getOffset() + chunk->outSecOff, - chunk->getSize()); + uint64_t offset = + chunk->outputSec != nullptr + ? chunk->outputSec->getOffset() + chunk->outSecOff + : 0; + writeHeader(os, chunk->getVA(), offset, chunk->getSize()); os.indent(8) << toString(chunk) << '\n'; for (Symbol *sym : sectionSyms[chunk]) os << symStr[sym] << '\n'; From 963378bd8278220eb382bec76846ef39e4ea597e Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 8 Jul 2021 15:29:47 +1000 Subject: [PATCH 19/32] [ORC] Improve computeLocalDeps / computeNamedSymbolDependencies performance. The computeNamedSymbolDependencies and computeLocalDeps methods on ObjectLinkingLayerJITLinkContext are responsible for computing, for each symbol in the current MaterializationResponsibility, the set of non-locally-scoped symbols that are depended on. To calculate this we have to consider the effect of chains of dependence through locally scoped symbols in the LinkGraph. E.g. .text .globl foo foo: callq bar ## foo depneds on external 'bar' movq Ltmp1(%rip), %rcx ## foo depends on locally scoped 'Ltmp1' addl (%rcx), %eax retq .data Ltmp1: .quad x ## Ltmp1 depends on external 'x' In this example symbol 'foo' depends directly on 'bar', and indirectly on 'x' via 'Ltmp1', which is locally scoped. Performance of the existing implementations appears to have been mediocre: Based on flame graphs posted by @drmeister (in #jit on the LLVM discord server) the computeLocalDeps function was taking up a substantial amount of time when starting up Clasp (https://github.com/clasp-developers/clasp). This commit attempts to address the performance problems in three ways: 1. Using jitlink::Blocks instead of jitlink::Symbols as the nodes of the dependencies-introduced-by-locally-scoped-symbols graph. Using either Blocks or Symbols as nodes provides the same information, but since there may be more than one locally scoped symbol per block the block-based version of the dependence graph should always be a subgraph of the Symbol-based version, and so faster to operate on. 2. Improved worklist management. The older version of computeLocalDeps used a fixed worklist containing all nodes, and iterated over this list propagating dependencies until no further changes were required. The worklist was not sorted into a useful order before the loop started. The new version uses a variable work-stack, visiting nodes in DFS order and only adding nodes when there is meaningful work to do on them. 
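(Illustration only, not part of the patch: a small, self-contained version of
this work-stack propagation over a generic block graph, using plain std
containers instead of the jitlink types. The names Block and propagateDeps are
invented for the example.)

    #include <cstddef>
    #include <set>
    #include <vector>

    struct Block {
      std::set<size_t> Deps;       // blocks this block depends on
      std::set<size_t> Dependants; // blocks that depend on this block
    };

    // Seed the stack with blocks that can actually propagate something, then
    // run to a fixed point: whenever a dependant's dependency set grows, push
    // it so the change flows onward. Blocks whose sets do not change are
    // never revisited.
    void propagateDeps(std::vector<Block> &Blocks) {
      std::vector<size_t> WorkStack;
      for (size_t I = 0; I != Blocks.size(); ++I)
        if (!Blocks[I].Deps.empty() && !Blocks[I].Dependants.empty())
          WorkStack.push_back(I);

      while (!WorkStack.empty()) {
        size_t Cur = WorkStack.back();
        WorkStack.pop_back();
        for (size_t Dependant : Blocks[Cur].Dependants) {
          bool Changed = false;
          for (size_t Dep : Blocks[Cur].Deps)
            if (Dep != Dependant)
              Changed |= Blocks[Dependant].Deps.insert(Dep).second;
          if (Changed)
            WorkStack.push_back(Dependant);
        }
      }
    }

    int main() {
      // Chain 0 -> 1 -> 2: block 0 depends on 1, block 1 depends on 2.
      std::vector<Block> Blocks(3);
      Blocks[0].Deps = {1};
      Blocks[1].Dependants = {0};
      Blocks[1].Deps = {2};
      Blocks[2].Dependants = {1};
      propagateDeps(Blocks);
      // Block 0 now also depends transitively on block 2.
      return Blocks[0].Deps.count(2) ? 0 : 1;
    }
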
Compared to the old version the new version avoids revisiting nodes which haven't changed, and I suspect it converges more quickly (due to the DFS ordering). 3. Laziness and caching. Mappings of... jitlink::Symbol* -> Interned Name (as SymbolStringPtr) jitlink::Block* -> Immediate dependencies (as SymbolNameSet) jitlink::Block* -> Transitive dependencies (as SymbolNameSet) are all built lazily and cached while running computeNamedSymbolDependencies. According to @drmeister these changes reduced Clasp startup time in his test setup (averaged over a handful of starts) from 4.8 to 2.8 seconds (with ORC/JITLink linking ~11,000 object files in that time), which seems like enough to justify switching to the new algorithm in the absence of any other perf numbers. --- .../llvm/ExecutionEngine/Orc/MachOPlatform.h | 8 +- .../ExecutionEngine/Orc/ObjectLinkingLayer.h | 17 +- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 30 +- .../Orc/ObjectLinkingLayer.cpp | 267 ++++++++++-------- 4 files changed, 180 insertions(+), 142 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index f9d0b587a1bed..9eb2ce33cf817 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -111,8 +111,8 @@ class MachOPlatform : public Platform { jitlink::LinkGraph &G, jitlink::PassConfiguration &Config) override; - LocalDependenciesMap getSyntheticSymbolLocalDependencies( - MaterializationResponsibility &MR) override; + SyntheticSymbolDependenciesMap + getSyntheticSymbolDependencies(MaterializationResponsibility &MR) override; // FIXME: We should be tentatively tracking scraped sections and discarding // if the MR fails. @@ -129,9 +129,9 @@ class MachOPlatform : public Platform { private: using InitSymbolDepMap = - DenseMap; + DenseMap; - void preserveInitSectionIfPresent(JITLinkSymbolVector &Syms, + void preserveInitSectionIfPresent(JITLinkSymbolSet &Symbols, jitlink::LinkGraph &G, StringRef SectionName); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h index 55d0634a82ae5..3bb83342dcdbb 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h @@ -64,8 +64,9 @@ class ObjectLinkingLayer : public RTTIExtends, /// configured. class Plugin { public: - using JITLinkSymbolVector = std::vector; - using LocalDependenciesMap = DenseMap; + using JITLinkSymbolSet = DenseSet; + using SyntheticSymbolDependenciesMap = + DenseMap; virtual ~Plugin(); virtual void modifyPassConfig(MaterializationResponsibility &MR, @@ -89,12 +90,12 @@ class ObjectLinkingLayer : public RTTIExtends, ResourceKey SrcKey) = 0; /// Return any dependencies that synthetic symbols (e.g. init symbols) - /// have on locally scoped jitlink::Symbols. This is used by the - /// ObjectLinkingLayer to update the dependencies for the synthetic - /// symbols. - virtual LocalDependenciesMap - getSyntheticSymbolLocalDependencies(MaterializationResponsibility &MR) { - return LocalDependenciesMap(); + /// have on symbols in the LinkGraph. + /// This is used by the ObjectLinkingLayer to update the dependencies for + /// the synthetic symbols. 
+ virtual SyntheticSymbolDependenciesMap + getSyntheticSymbolDependencies(MaterializationResponsibility &MR) { + return SyntheticSymbolDependenciesMap(); } }; diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 74c88b0c1c85b..39557a485cf28 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -314,17 +314,14 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( return; Config.PrePrunePasses.push_back([this, &MR](jitlink::LinkGraph &G) -> Error { - JITLinkSymbolVector InitSectionSymbols; - preserveInitSectionIfPresent(InitSectionSymbols, G, - "__DATA,__mod_init_func"); - preserveInitSectionIfPresent(InitSectionSymbols, G, - "__DATA,__objc_selrefs"); - preserveInitSectionIfPresent(InitSectionSymbols, G, - "__DATA,__objc_classlist"); - - if (!InitSectionSymbols.empty()) { + JITLinkSymbolSet InitSectionSyms; + preserveInitSectionIfPresent(InitSectionSyms, G, "__DATA,__mod_init_func"); + preserveInitSectionIfPresent(InitSectionSyms, G, "__DATA,__objc_selrefs"); + preserveInitSectionIfPresent(InitSectionSyms, G, "__DATA,__objc_classlist"); + + if (!InitSectionSyms.empty()) { std::lock_guard Lock(InitScraperMutex); - InitSymbolDeps[&MR] = std::move(InitSectionSymbols); + InitSymbolDeps[&MR] = std::move(InitSectionSyms); } if (auto Err = processObjCImageInfo(G, MR)) @@ -398,27 +395,26 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( }); } -ObjectLinkingLayer::Plugin::LocalDependenciesMap -MachOPlatform::InitScraperPlugin::getSyntheticSymbolLocalDependencies( +ObjectLinkingLayer::Plugin::SyntheticSymbolDependenciesMap +MachOPlatform::InitScraperPlugin::getSyntheticSymbolDependencies( MaterializationResponsibility &MR) { std::lock_guard Lock(InitScraperMutex); auto I = InitSymbolDeps.find(&MR); if (I != InitSymbolDeps.end()) { - LocalDependenciesMap Result; + SyntheticSymbolDependenciesMap Result; Result[MR.getInitializerSymbol()] = std::move(I->second); InitSymbolDeps.erase(&MR); return Result; } - return LocalDependenciesMap(); + return SyntheticSymbolDependenciesMap(); } void MachOPlatform::InitScraperPlugin::preserveInitSectionIfPresent( - JITLinkSymbolVector &Symbols, jitlink::LinkGraph &G, - StringRef SectionName) { + JITLinkSymbolSet &Symbols, jitlink::LinkGraph &G, StringRef SectionName) { if (auto *Sec = G.findSectionByName(SectionName)) { auto SecBlocks = Sec->blocks(); if (!llvm::empty(SecBlocks)) - Symbols.push_back( + Symbols.insert( &G.addAnonymousSymbol(**SecBlocks.begin(), 0, 0, false, true)); } } diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index c10aa15ef2697..a45b18544609d 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -331,12 +331,82 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } private: - struct LocalSymbolNamedDependencies { + // Symbol name dependencies: + // Internal: Defined in this graph. + // External: Defined externally. + struct BlockSymbolDependencies { SymbolNameSet Internal, External; }; - using LocalSymbolNamedDependenciesMap = - DenseMap; + // Lazily populated map of blocks to BlockSymbolDependencies values. 
+ class BlockDependenciesMap { + public: + BlockDependenciesMap(ExecutionSession &ES, + DenseMap> BlockDeps) + : ES(ES), BlockDeps(std::move(BlockDeps)) {} + + const BlockSymbolDependencies &operator[](const Block &B) { + // Check the cache first. + auto I = BlockTransitiveDepsCache.find(&B); + if (I != BlockTransitiveDepsCache.end()) + return I->second; + + // No value. Populate the cache. + BlockSymbolDependencies BTDCacheVal; + auto BDI = BlockDeps.find(&B); + assert(BDI != BlockDeps.end() && "No block dependencies"); + + for (auto *BDep : BDI->second) { + auto &BID = getBlockImmediateDeps(*BDep); + for (auto &ExternalDep : BID.External) + BTDCacheVal.External.insert(ExternalDep); + for (auto &InternalDep : BID.Internal) + BTDCacheVal.Internal.insert(InternalDep); + } + + return BlockTransitiveDepsCache + .insert(std::make_pair(&B, std::move(BTDCacheVal))) + .first->second; + } + + SymbolStringPtr &getInternedName(Symbol &Sym) { + auto I = NameCache.find(&Sym); + if (I != NameCache.end()) + return I->second; + + return NameCache.insert(std::make_pair(&Sym, ES.intern(Sym.getName()))) + .first->second; + } + + private: + BlockSymbolDependencies &getBlockImmediateDeps(Block &B) { + // Check the cache first. + auto I = BlockImmediateDepsCache.find(&B); + if (I != BlockImmediateDepsCache.end()) + return I->second; + + BlockSymbolDependencies BIDCacheVal; + for (auto &E : B.edges()) { + auto &Tgt = E.getTarget(); + if (Tgt.getScope() != Scope::Local) { + if (Tgt.isExternal()) + BIDCacheVal.External.insert(getInternedName(Tgt)); + else + BIDCacheVal.Internal.insert(getInternedName(Tgt)); + } + } + + return BlockImmediateDepsCache + .insert(std::make_pair(&B, std::move(BIDCacheVal))) + .first->second; + } + + ExecutionSession &ES; + DenseMap> BlockDeps; + DenseMap NameCache; + DenseMap BlockImmediateDepsCache; + DenseMap BlockTransitiveDepsCache; + }; Error claimOrExternalizeWeakAndCommonSymbols(LinkGraph &G) { auto &ES = Layer.getExecutionSession(); @@ -384,7 +454,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { Error computeNamedSymbolDependencies(LinkGraph &G) { auto &ES = MR->getTargetJITDylib().getExecutionSession(); - auto LocalDeps = computeLocalDeps(G); + auto BlockDeps = computeBlockNonLocalDeps(G); // Compute dependencies for symbols defined in the JITLink graph. for (auto *Sym : G.defined_symbols()) { @@ -395,58 +465,41 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { assert(Sym->hasName() && "Defined non-local jitlink::Symbol should have a name"); - SymbolNameSet ExternalSymDeps, InternalSymDeps; - - // Find internal and external named symbol dependencies. 
- for (auto &E : Sym->getBlock().edges()) { - auto &TargetSym = E.getTarget(); - - if (TargetSym.getScope() != Scope::Local) { - if (TargetSym.isExternal()) - ExternalSymDeps.insert(ES.intern(TargetSym.getName())); - else if (&TargetSym != Sym) - InternalSymDeps.insert(ES.intern(TargetSym.getName())); - } else { - assert(TargetSym.isDefined() && - "local symbols must be defined"); - auto I = LocalDeps.find(&TargetSym); - if (I != LocalDeps.end()) { - for (auto &S : I->second.External) - ExternalSymDeps.insert(S); - for (auto &S : I->second.Internal) - InternalSymDeps.insert(S); - } - } - } - - if (ExternalSymDeps.empty() && InternalSymDeps.empty()) + auto &SymDeps = BlockDeps[Sym->getBlock()]; + if (SymDeps.External.empty() && SymDeps.Internal.empty()) continue; auto SymName = ES.intern(Sym->getName()); - if (!ExternalSymDeps.empty()) - ExternalNamedSymbolDeps[SymName] = std::move(ExternalSymDeps); - if (!InternalSymDeps.empty()) - InternalNamedSymbolDeps[SymName] = std::move(InternalSymDeps); + if (!SymDeps.External.empty()) + ExternalNamedSymbolDeps[SymName] = SymDeps.External; + if (!SymDeps.Internal.empty()) + InternalNamedSymbolDeps[SymName] = SymDeps.Internal; } for (auto &P : Layer.Plugins) { - auto SyntheticLocalDeps = P->getSyntheticSymbolLocalDependencies(*MR); - if (SyntheticLocalDeps.empty()) + auto SynthDeps = P->getSyntheticSymbolDependencies(*MR); + if (SynthDeps.empty()) continue; - for (auto &KV : SyntheticLocalDeps) { + DenseSet BlockVisited; + for (auto &KV : SynthDeps) { auto &Name = KV.first; - auto &LocalDepsForName = KV.second; - for (auto *Local : LocalDepsForName) { - assert(Local->getScope() == Scope::Local && - "Dependence on non-local symbol"); - auto LocalNamedDepsItr = LocalDeps.find(Local); - if (LocalNamedDepsItr == LocalDeps.end()) - continue; - for (auto &S : LocalNamedDepsItr->second.Internal) - InternalNamedSymbolDeps[Name].insert(S); - for (auto &S : LocalNamedDepsItr->second.External) - ExternalNamedSymbolDeps[Name].insert(S); + auto &DepsForName = KV.second; + for (auto *Sym : DepsForName) { + if (Sym->getScope() == Scope::Local) { + auto &BDeps = BlockDeps[Sym->getBlock()]; + for (auto &S : BDeps.Internal) + InternalNamedSymbolDeps[Name].insert(S); + for (auto &S : BDeps.External) + ExternalNamedSymbolDeps[Name].insert(S); + } else { + if (Sym->isExternal()) + ExternalNamedSymbolDeps[Name].insert( + BlockDeps.getInternedName(*Sym)); + else + InternalNamedSymbolDeps[Name].insert( + BlockDeps.getInternedName(*Sym)); + } } } } @@ -454,81 +507,69 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { return Error::success(); } - LocalSymbolNamedDependenciesMap computeLocalDeps(LinkGraph &G) { - DenseMap> DepMap; - - // For all local symbols: - // (1) Add their named dependencies. - // (2) Add them to the worklist for further iteration if they have any - // depend on any other local symbols. - struct WorklistEntry { - WorklistEntry(Symbol *Sym, DenseSet LocalDeps) - : Sym(Sym), LocalDeps(std::move(LocalDeps)) {} - - Symbol *Sym = nullptr; - DenseSet LocalDeps; + BlockDependenciesMap computeBlockNonLocalDeps(LinkGraph &G) { + // First calculate the reachable-via-non-local-symbol blocks for each block. 
+ struct BlockInfo { + DenseSet Dependencies; + DenseSet Dependants; + bool DependenciesChanged = true; }; - std::vector Worklist; - for (auto *Sym : G.defined_symbols()) - if (Sym->getScope() == Scope::Local) { - auto &SymNamedDeps = DepMap[Sym]; - DenseSet LocalDeps; - - for (auto &E : Sym->getBlock().edges()) { - auto &TargetSym = E.getTarget(); - if (TargetSym.getScope() != Scope::Local) - SymNamedDeps.insert(&TargetSym); - else { - assert(TargetSym.isDefined() && - "local symbols must be defined"); - LocalDeps.insert(&TargetSym); + DenseMap BlockInfos; + SmallVector WorkList; + + // Pre-allocate map entries. This prevents any iterator/reference + // invalidation in the next loop. + for (auto *B : G.blocks()) + (void)BlockInfos[B]; + + // Build initial worklist, record block dependencies/dependants and + // non-local symbol dependencies. + for (auto *B : G.blocks()) { + auto &BI = BlockInfos[B]; + for (auto &E : B->edges()) { + if (E.getTarget().getScope() == Scope::Local) { + auto &TgtB = E.getTarget().getBlock(); + if (&TgtB != B) { + BI.Dependencies.insert(&TgtB); + BlockInfos[&TgtB].Dependants.insert(B); } } - - if (!LocalDeps.empty()) - Worklist.push_back(WorklistEntry(Sym, std::move(LocalDeps))); } - // Loop over all local symbols with local dependencies, propagating - // their respective non-local dependencies. Iterate until we hit a stable - // state. - bool Changed; - do { - Changed = false; - for (auto &WLEntry : Worklist) { - auto *Sym = WLEntry.Sym; - auto &NamedDeps = DepMap[Sym]; - auto &LocalDeps = WLEntry.LocalDeps; - - for (auto *TargetSym : LocalDeps) { - auto I = DepMap.find(TargetSym); - if (I != DepMap.end()) - for (const auto &S : I->second) - Changed |= NamedDeps.insert(S).second; - } - } - } while (Changed); + // If this node has both dependants and dependencies then add it to the + // worklist to propagate the dependencies to the dependants. + if (!BI.Dependants.empty() && !BI.Dependencies.empty()) + WorkList.push_back(B); + } - // Intern the results to produce a mapping of jitlink::Symbol* to internal - // and external symbol names. - auto &ES = Layer.getExecutionSession(); - LocalSymbolNamedDependenciesMap Result; - for (auto &KV : DepMap) { - auto *Local = KV.first; - assert(Local->getScope() == Scope::Local && - "DepMap keys should all be local symbols"); - auto &LocalNamedDeps = Result[Local]; - for (auto *Named : KV.second) { - assert(Named->getScope() != Scope::Local && - "DepMap values should all be non-local symbol sets"); - if (Named->isExternal()) - LocalNamedDeps.External.insert(ES.intern(Named->getName())); - else - LocalNamedDeps.Internal.insert(ES.intern(Named->getName())); + // Propagate block-level dependencies through the block-dependence graph. 
+ while (!WorkList.empty()) { + auto *B = WorkList.back(); + WorkList.pop_back(); + + auto &BI = BlockInfos[B]; + assert(BI.DependenciesChanged && + "Block in worklist has unchanged dependencies"); + BI.DependenciesChanged = false; + for (auto *Dependant : BI.Dependants) { + auto &DependantBI = BlockInfos[Dependant]; + for (auto *Dependency : BI.Dependencies) { + if (Dependant != Dependency && + DependantBI.Dependencies.insert(Dependency).second) + if (!DependantBI.DependenciesChanged) { + DependantBI.DependenciesChanged = true; + WorkList.push_back(Dependant); + } + } } } - return Result; + DenseMap> BlockDeps; + for (auto &KV : BlockInfos) + BlockDeps[KV.first] = std::move(KV.second.Dependencies); + + return BlockDependenciesMap(Layer.getExecutionSession(), + std::move(BlockDeps)); } void registerDependencies(const SymbolDependenceMap &QueryDeps) { From d7afd11e3dc14d50156618cb27689f1425239c86 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 8 Jul 2021 16:29:39 +1000 Subject: [PATCH 20/32] [ORC] Introduce ExecutorAddress type, fix broken LLDB bot. ExecutorAddressRange depended on JITTargetAddress, but JITTargetAddress is defined in ExecutionEngine, which OrcShared should not depend on. This seems like as good a time as any to introduce a new ExecutorAddress type to eventually replace JITTargetAddress. For now it's just another uint64_t alias, but it will soon be changed to a class type to provide greater type safety. --- .../ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h index 8b0e6272a555b..efc4409b84f47 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h @@ -19,18 +19,20 @@ namespace llvm { namespace orc { namespace shared { +// Placeholder for future replacement for JITTargetAddress. +using ExecutorAddress = uint64_t; + /// Represents an address range in the exceutor process. struct ExecutorAddressRange { ExecutorAddressRange() = default; - ExecutorAddressRange(JITTargetAddress StartAddress, - JITTargetAddress EndAddress) + ExecutorAddressRange(ExecutorAddress StartAddress, ExecutorAddress EndAddress) : StartAddress(StartAddress), EndAddress(EndAddress) {} bool empty() const { return StartAddress == EndAddress; } size_t size() const { return EndAddress - StartAddress; } - JITTargetAddress StartAddress = 0; - JITTargetAddress EndAddress = 0; + ExecutorAddress StartAddress = 0; + ExecutorAddress EndAddress = 0; }; using SPSExecutorAddressRange = From 511af1b1ad005af61ce792286a76633cd56ef7f9 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Thu, 8 Jul 2021 06:23:55 +0000 Subject: [PATCH 21/32] [mlir][linalg] Tighter StructuredOp Verification. Verify the number of results matches exactly the number of output tensors. Simplify the FillOp verification since part of it got redundant. 
Differential Revision: https://reviews.llvm.org/D105427 --- mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp | 14 ++++++-------- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 4 ---- mlir/test/Dialect/Linalg/invalid.mlir | 6 +++--- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 7774dbe5cd722..7d22cfd3ef0eb 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -338,10 +338,12 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { return op->emitOpError("expected at least one output operand"); if (failed(OpTrait::impl::verifyNOperands(op, numInputs + numOutputs))) return failure(); - // Should have at least one output tensor per result tensor. - // Can also have outbut buffers that do not correspond to results. - if (op->getNumResults() > linalgOp.getOutputTensorOperands().size()) - return op->emitOpError("unexpected #results > #outputs"); + // Verify the number of results matches the number of output tensors. + if (op->getNumResults() != linalgOp.getOutputTensorOperands().size()) + return op->emitOpError("expected the number of results (") + << op->getNumResults() + << ") to be equal to the number of output tensors (" + << linalgOp.getOutputTensorOperands().size() << ")"; // Before checking indexing maps, we need to make sure the attributes // referenced by it are valid. @@ -394,10 +396,6 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { "all have buffer type"); for (OpOperand *opOperand : linalgOp.getOutputTensorOperands()) { - // TODO: Enforce one output tensor per result? - if (opOperand->getOperandNumber() - linalgOp.getNumInputs() >= - linalgOp->getNumResults()) - continue; OpResult result = linalgOp.getTiedOpResult(opOperand); if (result.getType() != opOperand->get().getType()) return op->emitOpError("expected type of operand #") diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 66cad6eaa3ccc..93062b10ccc63 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -458,10 +458,6 @@ static LogicalResult verify(FillOp op) { Type fillType = op.value().getType(); if (getElementTypeOrSelf(output->get()) != fillType) return op.emitOpError("expects fill type to match view elemental type"); - if (!op.getNumResults() && !output->get().getType().isa()) { - return op.emitOpError( - "expected fill op with no result value to use memref type"); - } return success(); } diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index 8f26533f0b32f..6d8536a730d7a 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -640,7 +640,7 @@ func @pad_yield_type(%arg0: tensor, %arg1: i8) -> tensor { func @illegal_fill_tensor_no_return(%arg0 : index, %arg1 : index, %arg2 : f32) { %0 = linalg.init_tensor [%arg0, %arg1] : tensor - // expected-error @+1 {{expected fill op with no result value to use memref type}} + // expected-error @+1 {{expected the number of results (0) to be equal to the number of output tensors (1)}} linalg.fill(%arg2, %0) : f32, tensor } @@ -648,7 +648,7 @@ func @illegal_fill_tensor_no_return(%arg0 : index, %arg1 : index, %arg2 : f32) func @illegal_fill_memref_with_return(%arg0 : memref, %arg1 : f32) -> memref { - // expected-error @+1 {{unexpected #results > #outputs}} + // 
expected-error @+1 {{expected the number of results (1) to be equal to the number of output tensors (0)}} %0 = linalg.fill(%arg1, %arg0) : f32, memref -> memref return %0 : memref } @@ -658,7 +658,7 @@ func @illegal_fill_memref_with_return(%arg0 : memref, %arg1 : f32) -> m func @illegal_fill_memref_with_tensor_return (%arg0 : memref, %arg1 : f32) -> tensor { - // expected-error @+1 {{unexpected #results > #outputs}} + // expected-error @+1 {{expected the number of results (1) to be equal to the number of output tensors (0)}} %0 = linalg.fill(%arg1, %arg0) : f32, memref -> tensor return %0 : tensor } From 21fd8759529707b0f4430ebe8f27a01edc7f655e Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Thu, 8 Jul 2021 09:46:30 +0200 Subject: [PATCH 22/32] [lld/mac] Fix warning about unused variable [NFC] Change "dyn_cast" to "isa" to get rid of the unused variable "bitcodeFile". gcc warned with lld/MachO/Driver.cpp:531:17: warning: unused variable 'bitcodeFile' [-Wunused-variable] 531 | if (auto *bitcodeFile = dyn_cast(file)) { | ^~~~~~~~~~~ --- lld/MachO/Driver.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index f886f0e03929c..045ca85dcab31 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -528,7 +528,7 @@ static void compileBitcodeFiles() { // FIXME: Remove this once LTO.cpp honors config->exportDynamic. if (config->exportDynamic) for (InputFile *file : inputFiles) - if (auto *bitcodeFile = dyn_cast(file)) { + if (isa(file)) { warn("the effect of -export_dynamic on LTO is not yet implemented"); break; } From 9ced1e44adef3b329dde4c8f681d4d6b5825ea54 Mon Sep 17 00:00:00 2001 From: Sebastian Neubauer Date: Thu, 8 Jul 2021 10:07:33 +0200 Subject: [PATCH 23/32] [AMDGPU] Fix typo --- llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 7beef25794b16..acfee63d203ab 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -365,7 +365,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion( unsigned BBNum = MBB->getNumber(); if (OldVarInfo.AliveBlocks.test(BBNum)) { NewVarInfo.AliveBlocks.set(BBNum); - LLVM_DEBUG(dbgs() << "Removing ALiveBlock " << printMBBReference(*MBB) + LLVM_DEBUG(dbgs() << "Removing AliveBlock " << printMBBReference(*MBB) << '\n'); OldVarInfo.AliveBlocks.reset(BBNum); } From 684dfe8adb7eb6037b20e12364cae97c01ff2190 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Wed, 7 Jul 2021 11:45:27 +0200 Subject: [PATCH 24/32] [mlir] factor out ConvertToLLVMPattern This class and classes that extend it are general utilities for any dialect that is being converted into the LLVM dialect. They are in no way specific to Standard-to-LLVM conversion and should not make their users depend on it. 
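(Illustration only, not part of the patch: with these classes factored out, a
conversion for some other dialect can depend on just the shared LLVMCommon
headers. "mydialect::ExpOp" is a made-up source op standing in for any
dialect's op, and the pairing with LLVM::ExpOp is only an example; the base
classes and includes are the ones introduced by this change.)

    #include "mlir/Conversion/LLVMCommon/Pattern.h"       // pattern bases
    #include "mlir/Conversion/LLVMCommon/TypeConverter.h" // LLVMTypeConverter
    #include "mlir/Dialect/LLVMIR/LLVMDialect.h"

    // One-to-one lowering of a hypothetical op to an LLVM dialect op; no
    // Standard-to-LLVM header is required any more.
    using ExpOpLowering =
        mlir::OneToOneConvertToLLVMPattern<mydialect::ExpOp, mlir::LLVM::ExpOp>;

    void populateMyDialectToLLVMPatterns(mlir::LLVMTypeConverter &converter,
                                         mlir::RewritePatternSet &patterns) {
      patterns.add<ExpOpLowering>(converter);
    }
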
Reviewed By: nicolasvasilache Differential Revision: https://reviews.llvm.org/D105542 --- .../mlir/Conversion/LLVMCommon/Pattern.h | 193 +++++++++ .../Conversion/LLVMCommon/VectorPattern.h | 85 ++++ .../StandardToLLVM/ConvertStandardToLLVM.h | 242 ++--------- .../ConvertStandardToLLVMPass.h | 28 -- mlir/lib/Conversion/GPUCommon/CMakeLists.txt | 1 + .../GPUCommon/GPUToLLVMConversion.cpp | 2 + mlir/lib/Conversion/LLVMCommon/CMakeLists.txt | 2 + mlir/lib/Conversion/LLVMCommon/Pattern.cpp | 269 ++++++++++++ .../Conversion/LLVMCommon/VectorPattern.cpp | 142 ++++++ .../Conversion/OpenMPToLLVM/CMakeLists.txt | 1 + .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp | 2 + .../StandardToLLVM/StandardToLLVM.cpp | 406 +----------------- 12 files changed, 732 insertions(+), 641 deletions(-) create mode 100644 mlir/include/mlir/Conversion/LLVMCommon/Pattern.h create mode 100644 mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h create mode 100644 mlir/lib/Conversion/LLVMCommon/Pattern.cpp create mode 100644 mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h new file mode 100644 index 0000000000000..323c9cfeb97f5 --- /dev/null +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -0,0 +1,193 @@ +//===- Pattern.h - Pattern for conversion to the LLVM dialect ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_LLVMCOMMON_PATTERN_H +#define MLIR_CONVERSION_LLVMCOMMON_PATTERN_H + +#include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { + +namespace LLVM { +namespace detail { +/// Replaces the given operation "op" with a new operation of type "targetOp" +/// and given operands. +LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp, + ValueRange operands, + LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter); +} // namespace detail +} // namespace LLVM + +/// Base class for operation conversions targeting the LLVM IR dialect. It +/// provides the conversion patterns with access to the LLVMTypeConverter and +/// the LowerToLLVMOptions. The class captures the LLVMTypeConverter and the +/// LowerToLLVMOptions by reference meaning the references have to remain alive +/// during the entire pattern lifetime. +class ConvertToLLVMPattern : public ConversionPattern { +public: + ConvertToLLVMPattern(StringRef rootOpName, MLIRContext *context, + LLVMTypeConverter &typeConverter, + PatternBenefit benefit = 1); + +protected: + /// Returns the LLVM dialect. + LLVM::LLVMDialect &getDialect() const; + + LLVMTypeConverter *getTypeConverter() const; + + /// Gets the MLIR type wrapping the LLVM integer type whose bit width is + /// defined by the used type converter. + Type getIndexType() const; + + /// Gets the MLIR type wrapping the LLVM integer type whose bit width + /// corresponds to that of a LLVM pointer type. + Type getIntPtrType(unsigned addressSpace = 0) const; + + /// Gets the MLIR type wrapping the LLVM void type. + Type getVoidType() const; + + /// Get the MLIR type wrapping the LLVM i8* type. 
+ Type getVoidPtrType() const; + + /// Create a constant Op producing a value of `resultType` from an index-typed + /// integer attribute. + static Value createIndexAttrConstant(OpBuilder &builder, Location loc, + Type resultType, int64_t value); + + /// Create an LLVM dialect operation defining the given index constant. + Value createIndexConstant(ConversionPatternRewriter &builder, Location loc, + uint64_t value) const; + + // This is a strided getElementPtr variant that linearizes subscripts as: + // `base_offset + index_0 * stride_0 + ... + index_n * stride_n`. + Value getStridedElementPtr(Location loc, MemRefType type, Value memRefDesc, + ValueRange indices, + ConversionPatternRewriter &rewriter) const; + + /// Returns if the given memref has identity maps and the element type is + /// convertible to LLVM. + bool isConvertibleAndHasIdentityMaps(MemRefType type) const; + + /// Returns the type of a pointer to an element of the memref. + Type getElementPtrType(MemRefType type) const; + + /// Computes sizes, strides and buffer size in bytes of `memRefType` with + /// identity layout. Emits constant ops for the static sizes of `memRefType`, + /// and uses `dynamicSizes` for the others. Emits instructions to compute + /// strides and buffer size from these sizes. + /// + /// For example, memref<4x?xf32> emits: + /// `sizes[0]` = llvm.mlir.constant(4 : index) : i64 + /// `sizes[1]` = `dynamicSizes[0]` + /// `strides[1]` = llvm.mlir.constant(1 : index) : i64 + /// `strides[0]` = `sizes[0]` + /// %size = llvm.mul `sizes[0]`, `sizes[1]` : i64 + /// %nullptr = llvm.mlir.null : !llvm.ptr + /// %gep = llvm.getelementptr %nullptr[%size] + /// : (!llvm.ptr, i64) -> !llvm.ptr + /// `sizeBytes` = llvm.ptrtoint %gep : !llvm.ptr to i64 + void getMemRefDescriptorSizes(Location loc, MemRefType memRefType, + ValueRange dynamicSizes, + ConversionPatternRewriter &rewriter, + SmallVectorImpl &sizes, + SmallVectorImpl &strides, + Value &sizeBytes) const; + + /// Computes the size of type in bytes. + Value getSizeInBytes(Location loc, Type type, + ConversionPatternRewriter &rewriter) const; + + /// Computes total number of elements for the given shape. + Value getNumElements(Location loc, ArrayRef shape, + ConversionPatternRewriter &rewriter) const; + + /// Creates and populates a canonical memref descriptor struct. + MemRefDescriptor + createMemRefDescriptor(Location loc, MemRefType memRefType, + Value allocatedPtr, Value alignedPtr, + ArrayRef sizes, ArrayRef strides, + ConversionPatternRewriter &rewriter) const; +}; + +/// Utility class for operation conversions targeting the LLVM dialect that +/// match exactly one source operation. +template +class ConvertOpToLLVMPattern : public ConvertToLLVMPattern { +public: + explicit ConvertOpToLLVMPattern(LLVMTypeConverter &typeConverter, + PatternBenefit benefit = 1) + : ConvertToLLVMPattern(SourceOp::getOperationName(), + &typeConverter.getContext(), typeConverter, + benefit) {} + + /// Wrappers around the RewritePattern methods that pass the derived op type. + void rewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + rewrite(cast(op), operands, rewriter); + } + LogicalResult match(Operation *op) const final { + return match(cast(op)); + } + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + return matchAndRewrite(cast(op), operands, rewriter); + } + + /// Rewrite and Match methods that operate on the SourceOp type. 
These must be + /// overridden by the derived pattern class. + virtual void rewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + llvm_unreachable("must override rewrite or matchAndRewrite"); + } + virtual LogicalResult match(SourceOp op) const { + llvm_unreachable("must override match or matchAndRewrite"); + } + virtual LogicalResult + matchAndRewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + if (succeeded(match(op))) { + rewrite(op, operands, rewriter); + return success(); + } + return failure(); + } + +private: + using ConvertToLLVMPattern::match; + using ConvertToLLVMPattern::matchAndRewrite; +}; + +/// Generic implementation of one-to-one conversion from "SourceOp" to +/// "TargetOp" where the latter belongs to the LLVM dialect or an equivalent. +/// Upholds a convention that multi-result operations get converted into an +/// operation returning the LLVM IR structure type, in which case individual +/// values must be extracted from using LLVM::ExtractValueOp before being used. +template +class OneToOneConvertToLLVMPattern : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using Super = OneToOneConvertToLLVMPattern; + + /// Converts the type of the result to an LLVM type, pass operands as is, + /// preserve attributes. + LogicalResult + matchAndRewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + return LLVM::detail::oneToOneRewrite(op, TargetOp::getOperationName(), + operands, *this->getTypeConverter(), + rewriter); + } +}; + +} // namespace mlir + +#endif // MLIR_CONVERSION_LLVMCOMMON_PATTERN_H diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h new file mode 100644 index 0000000000000..383516ac3cd6d --- /dev/null +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -0,0 +1,85 @@ +//===- VectorPattern.h - Conversion pattern to the LLVM dialect -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_LLVMCOMMON_VECTORPATTERN_H +#define MLIR_CONVERSION_LLVMCOMMON_VECTORPATTERN_H + +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { + +namespace LLVM { +namespace detail { +// Helper struct to "unroll" operations on n-D vectors in terms of operations on +// 1-D LLVM vectors. +struct NDVectorTypeInfo { + // LLVM array struct which encodes n-D vectors. + Type llvmNDVectorTy; + // LLVM vector type which encodes the inner 1-D vector type. + Type llvm1DVectorTy; + // Multiplicity of llvmNDVectorTy to llvm1DVectorTy. + SmallVector arraySizes; +}; + +// For >1-D vector types, extracts the necessary information to iterate over all +// 1-D subvectors in the underlying llrepresentation of the n-D vector +// Iterates on the llvm array type until we hit a non-array type (which is +// asserted to be an llvm vector type). +NDVectorTypeInfo extractNDVectorTypeInfo(VectorType vectorType, + LLVMTypeConverter &converter); + +// Express `linearIndex` in terms of coordinates of `basis`. +// Returns the empty vector when linearIndex is out of the range [0, P] where +// P is the product of all the basis coordinates. 
+// +// Prerequisites: +// Basis is an array of nonnegative integers (signed type inherited from +// vector shape type). +SmallVector getCoordinates(ArrayRef basis, + unsigned linearIndex); + +// Iterate of linear index, convert to coords space and insert splatted 1-D +// vector in each position. +void nDVectorIterate(const NDVectorTypeInfo &info, OpBuilder &builder, + function_ref fun); + +LogicalResult handleMultidimensionalVectors( + Operation *op, ValueRange operands, LLVMTypeConverter &typeConverter, + std::function createOperand, + ConversionPatternRewriter &rewriter); + +LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp, + ValueRange operands, + LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter); +} // namespace detail +} // namespace LLVM + +/// Basic lowering implementation to rewrite Ops with just one result to the +/// LLVM Dialect. This supports higher-dimensional vector types. +template +class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using Super = VectorConvertToLLVMPattern; + + LogicalResult + matchAndRewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + static_assert( + std::is_base_of, SourceOp>::value, + "expected single result op"); + return LLVM::detail::vectorOneToOneRewrite( + op, TargetOp::getOperationName(), operands, *this->getTypeConverter(), + rewriter); + } +}; +} // namespace mlir + +#endif // MLIR_CONVERSION_LLVMCOMMON_VECTORPATTERN_H diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h index a76e91ae3d006..604556f3e0a47 100644 --- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -15,167 +15,37 @@ #ifndef MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVM_H #define MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVM_H -#include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" -#include "mlir/Conversion/LLVMCommon/TypeConverter.h" -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" -#include "mlir/Transforms/DialectConversion.h" - -namespace llvm { -class IntegerType; -class LLVMContext; -class Module; -class Type; -} // namespace llvm +#include "mlir/Conversion/LLVMCommon/Pattern.h" namespace mlir { -class BaseMemRefType; -class ComplexType; -class DataLayoutAnalysis; class LLVMTypeConverter; -class UnrankedMemRefType; - -namespace LLVM { -class LLVMDialect; -class LLVMPointerType; -} // namespace LLVM - -// ------------------ - -/// Base class for operation conversions targeting the LLVM IR dialect. It -/// provides the conversion patterns with access to the LLVMTypeConverter and -/// the LowerToLLVMOptions. The class captures the LLVMTypeConverter and the -/// LowerToLLVMOptions by reference meaning the references have to remain alive -/// during the entire pattern lifetime. -class ConvertToLLVMPattern : public ConversionPattern { -public: - ConvertToLLVMPattern(StringRef rootOpName, MLIRContext *context, - LLVMTypeConverter &typeConverter, - PatternBenefit benefit = 1); - -protected: - /// Returns the LLVM dialect. - LLVM::LLVMDialect &getDialect() const; - - LLVMTypeConverter *getTypeConverter() const; - - /// Gets the MLIR type wrapping the LLVM integer type whose bit width is - /// defined by the used type converter. 
- Type getIndexType() const; - - /// Gets the MLIR type wrapping the LLVM integer type whose bit width - /// corresponds to that of a LLVM pointer type. - Type getIntPtrType(unsigned addressSpace = 0) const; - - /// Gets the MLIR type wrapping the LLVM void type. - Type getVoidType() const; - - /// Get the MLIR type wrapping the LLVM i8* type. - Type getVoidPtrType() const; - - /// Create an LLVM dialect operation defining the given index constant. - Value createIndexConstant(ConversionPatternRewriter &builder, Location loc, - uint64_t value) const; - - // This is a strided getElementPtr variant that linearizes subscripts as: - // `base_offset + index_0 * stride_0 + ... + index_n * stride_n`. - Value getStridedElementPtr(Location loc, MemRefType type, Value memRefDesc, - ValueRange indices, - ConversionPatternRewriter &rewriter) const; - - /// Returns if the given memref has identity maps and the element type is - /// convertible to LLVM. - bool isConvertibleAndHasIdentityMaps(MemRefType type) const; - - /// Returns the type of a pointer to an element of the memref. - Type getElementPtrType(MemRefType type) const; - - /// Computes sizes, strides and buffer size in bytes of `memRefType` with - /// identity layout. Emits constant ops for the static sizes of `memRefType`, - /// and uses `dynamicSizes` for the others. Emits instructions to compute - /// strides and buffer size from these sizes. - /// - /// For example, memref<4x?xf32> emits: - /// `sizes[0]` = llvm.mlir.constant(4 : index) : i64 - /// `sizes[1]` = `dynamicSizes[0]` - /// `strides[1]` = llvm.mlir.constant(1 : index) : i64 - /// `strides[0]` = `sizes[0]` - /// %size = llvm.mul `sizes[0]`, `sizes[1]` : i64 - /// %nullptr = llvm.mlir.null : !llvm.ptr - /// %gep = llvm.getelementptr %nullptr[%size] - /// : (!llvm.ptr, i64) -> !llvm.ptr - /// `sizeBytes` = llvm.ptrtoint %gep : !llvm.ptr to i64 - void getMemRefDescriptorSizes(Location loc, MemRefType memRefType, - ValueRange dynamicSizes, - ConversionPatternRewriter &rewriter, - SmallVectorImpl &sizes, - SmallVectorImpl &strides, - Value &sizeBytes) const; - - /// Computes the size of type in bytes. - Value getSizeInBytes(Location loc, Type type, - ConversionPatternRewriter &rewriter) const; - - /// Computes total number of elements for the given shape. - Value getNumElements(Location loc, ArrayRef shape, - ConversionPatternRewriter &rewriter) const; - - /// Creates and populates a canonical memref descriptor struct. - MemRefDescriptor - createMemRefDescriptor(Location loc, MemRefType memRefType, - Value allocatedPtr, Value alignedPtr, - ArrayRef sizes, ArrayRef strides, - ConversionPatternRewriter &rewriter) const; -}; - -/// Utility class for operation conversions targeting the LLVM dialect that -/// match exactly one source operation. -template -class ConvertOpToLLVMPattern : public ConvertToLLVMPattern { -public: - explicit ConvertOpToLLVMPattern(LLVMTypeConverter &typeConverter, - PatternBenefit benefit = 1) - : ConvertToLLVMPattern(SourceOp::getOperationName(), - &typeConverter.getContext(), typeConverter, - benefit) {} - - /// Wrappers around the RewritePattern methods that pass the derived op type. 
- void rewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const final { - rewrite(cast(op), operands, rewriter); - } - LogicalResult match(Operation *op) const final { - return match(cast(op)); - } - LogicalResult - matchAndRewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const final { - return matchAndRewrite(cast(op), operands, rewriter); - } - - /// Rewrite and Match methods that operate on the SourceOp type. These must be - /// overridden by the derived pattern class. - virtual void rewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { - llvm_unreachable("must override rewrite or matchAndRewrite"); - } - virtual LogicalResult match(SourceOp op) const { - llvm_unreachable("must override match or matchAndRewrite"); - } - virtual LogicalResult - matchAndRewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { - if (succeeded(match(op))) { - rewrite(op, operands, rewriter); - return success(); - } - return failure(); - } - -private: - using ConvertToLLVMPattern::match; - using ConvertToLLVMPattern::matchAndRewrite; -}; +class RewritePatternSet; + +/// Collect a set of patterns to convert memory-related operations from the +/// Standard dialect to the LLVM dialect, excluding non-memory-related +/// operations and FuncOp. +void populateStdToLLVMMemoryConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); + +/// Collect a set of patterns to convert from the Standard dialect to the LLVM +/// dialect, excluding the memory-related operations. +void populateStdToLLVMNonMemoryConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); + +/// Collect the default pattern to convert a FuncOp to the LLVM dialect. If +/// `emitCWrappers` is set, the pattern will also produce functions +/// that pass memref descriptors by pointer-to-structure in addition to the +/// default unpacked form. +void populateStdToLLVMFuncOpConversionPattern(LLVMTypeConverter &converter, + RewritePatternSet &patterns); + +/// Collect the patterns to convert from the Standard dialect to LLVM. The +/// conversion patterns capture the LLVMTypeConverter and the LowerToLLVMOptions +/// by reference meaning the references have to remain alive during the entire +/// pattern lifetime. +void populateStdToLLVMConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); /// Lowering for AllocOp and AllocaOp. struct AllocLikeOpLLVMLowering : public ConvertToLLVMPattern { @@ -226,64 +96,6 @@ struct AllocLikeOpLLVMLowering : public ConvertToLLVMPattern { ConversionPatternRewriter &rewriter) const override; }; -namespace LLVM { -namespace detail { -/// Replaces the given operation "op" with a new operation of type "targetOp" -/// and given operands. -LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter); - -LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter); -} // namespace detail -} // namespace LLVM - -/// Generic implementation of one-to-one conversion from "SourceOp" to -/// "TargetOp" where the latter belongs to the LLVM dialect or an equivalent. 
-/// Upholds a convention that multi-result operations get converted into an -/// operation returning the LLVM IR structure type, in which case individual -/// values must be extracted from using LLVM::ExtractValueOp before being used. -template -class OneToOneConvertToLLVMPattern : public ConvertOpToLLVMPattern { -public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - using Super = OneToOneConvertToLLVMPattern; - - /// Converts the type of the result to an LLVM type, pass operands as is, - /// preserve attributes. - LogicalResult - matchAndRewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - return LLVM::detail::oneToOneRewrite(op, TargetOp::getOperationName(), - operands, *this->getTypeConverter(), - rewriter); - } -}; - -/// Basic lowering implementation to rewrite Ops with just one result to the -/// LLVM Dialect. This supports higher-dimensional vector types. -template -class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern { -public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - using Super = VectorConvertToLLVMPattern; - - LogicalResult - matchAndRewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - static_assert( - std::is_base_of, SourceOp>::value, - "expected single result op"); - return LLVM::detail::vectorOneToOneRewrite( - op, TargetOp::getOperationName(), operands, *this->getTypeConverter(), - rewriter); - } -}; - /// Derived class that automatically populates legalization information for /// different LLVM ops. class LLVMConversionTarget : public ConversionTarget { diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h index 560794a0a925f..6d809d97234e1 100644 --- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h @@ -12,38 +12,10 @@ #include namespace mlir { -class LLVMTypeConverter; class LowerToLLVMOptions; class ModuleOp; template class OperationPass; -class RewritePatternSet; -using OwningRewritePatternList = RewritePatternSet; - -/// Collect a set of patterns to convert memory-related operations from the -/// Standard dialect to the LLVM dialect, excluding non-memory-related -/// operations and FuncOp. -void populateStdToLLVMMemoryConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); - -/// Collect a set of patterns to convert from the Standard dialect to the LLVM -/// dialect, excluding the memory-related operations. -void populateStdToLLVMNonMemoryConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); - -/// Collect the default pattern to convert a FuncOp to the LLVM dialect. If -/// `emitCWrappers` is set, the pattern will also produce functions -/// that pass memref descriptors by pointer-to-structure in addition to the -/// default unpacked form. -void populateStdToLLVMFuncOpConversionPattern(LLVMTypeConverter &converter, - RewritePatternSet &patterns); - -/// Collect the patterns to convert from the Standard dialect to LLVM. The -/// conversion patterns capture the LLVMTypeConverter and the LowerToLLVMOptions -/// by reference meaning the references have to remain alive during the entire -/// pattern lifetime. -void populateStdToLLVMConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); /// Creates a pass to convert the Standard dialect into the LLVMIR dialect. 
/// stdlib malloc/free is used by default for allocating memrefs allocated with diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt index 71694bf925188..988071e6a00de 100644 --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -32,6 +32,7 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms MLIRAsyncToLLVM MLIRGPUTransforms MLIRIR + MLIRLLVMCommonConversion MLIRLLVMIR MLIRPass MLIRSupport diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index dcb28f462d570..557eabcad79e9 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -17,7 +17,9 @@ #include "../PassDetail.h" #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/Async/IR/Async.h" #include "mlir/Dialect/GPU/GPUDialect.h" diff --git a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt index 14c7c94b983f9..3657e56e61b17 100644 --- a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt @@ -1,8 +1,10 @@ add_mlir_conversion_library(MLIRLLVMCommonConversion LoweringOptions.cpp MemRefBuilder.cpp + Pattern.cpp StructBuilder.cpp TypeConverter.cpp + VectorPattern.cpp LINK_COMPONENTS Core diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp new file mode 100644 index 0000000000000..6e5a827b34e8d --- /dev/null +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -0,0 +1,269 @@ +//===- Pattern.cpp - Conversion pattern to the LLVM dialect ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/AffineMap.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ConvertToLLVMPattern +//===----------------------------------------------------------------------===// + +ConvertToLLVMPattern::ConvertToLLVMPattern(StringRef rootOpName, + MLIRContext *context, + LLVMTypeConverter &typeConverter, + PatternBenefit benefit) + : ConversionPattern(typeConverter, rootOpName, benefit, context) {} + +LLVMTypeConverter *ConvertToLLVMPattern::getTypeConverter() const { + return static_cast( + ConversionPattern::getTypeConverter()); +} + +LLVM::LLVMDialect &ConvertToLLVMPattern::getDialect() const { + return *getTypeConverter()->getDialect(); +} + +Type ConvertToLLVMPattern::getIndexType() const { + return getTypeConverter()->getIndexType(); +} + +Type ConvertToLLVMPattern::getIntPtrType(unsigned addressSpace) const { + return IntegerType::get(&getTypeConverter()->getContext(), + getTypeConverter()->getPointerBitwidth(addressSpace)); +} + +Type ConvertToLLVMPattern::getVoidType() const { + return LLVM::LLVMVoidType::get(&getTypeConverter()->getContext()); +} + +Type ConvertToLLVMPattern::getVoidPtrType() const { + return LLVM::LLVMPointerType::get( + IntegerType::get(&getTypeConverter()->getContext(), 8)); +} + +Value ConvertToLLVMPattern::createIndexAttrConstant(OpBuilder &builder, + Location loc, + Type resultType, + int64_t value) { + return builder.create( + loc, resultType, builder.getIntegerAttr(builder.getIndexType(), value)); +} + +Value ConvertToLLVMPattern::createIndexConstant( + ConversionPatternRewriter &builder, Location loc, uint64_t value) const { + return createIndexAttrConstant(builder, loc, getIndexType(), value); +} + +Value ConvertToLLVMPattern::getStridedElementPtr( + Location loc, MemRefType type, Value memRefDesc, ValueRange indices, + ConversionPatternRewriter &rewriter) const { + + int64_t offset; + SmallVector strides; + auto successStrides = getStridesAndOffset(type, strides, offset); + assert(succeeded(successStrides) && "unexpected non-strided memref"); + (void)successStrides; + + MemRefDescriptor memRefDescriptor(memRefDesc); + Value base = memRefDescriptor.alignedPtr(rewriter, loc); + + Value index; + if (offset != 0) // Skip if offset is zero. + index = MemRefType::isDynamicStrideOrOffset(offset) + ? memRefDescriptor.offset(rewriter, loc) + : createIndexConstant(rewriter, loc, offset); + + for (int i = 0, e = indices.size(); i < e; ++i) { + Value increment = indices[i]; + if (strides[i] != 1) { // Skip if stride is 1. + Value stride = MemRefType::isDynamicStrideOrOffset(strides[i]) + ? memRefDescriptor.stride(rewriter, loc, i) + : createIndexConstant(rewriter, loc, strides[i]); + increment = rewriter.create(loc, increment, stride); + } + index = + index ? rewriter.create(loc, index, increment) : increment; + } + + Type elementPtrType = memRefDescriptor.getElementPtrType(); + return index ? rewriter.create(loc, elementPtrType, base, index) + : base; +} + +// Check if the MemRefType `type` is supported by the lowering. We currently +// only support memrefs with identity maps. 
+bool ConvertToLLVMPattern::isConvertibleAndHasIdentityMaps( + MemRefType type) const { + if (!typeConverter->convertType(type.getElementType())) + return false; + return type.getAffineMaps().empty() || + llvm::all_of(type.getAffineMaps(), + [](AffineMap map) { return map.isIdentity(); }); +} + +Type ConvertToLLVMPattern::getElementPtrType(MemRefType type) const { + auto elementType = type.getElementType(); + auto structElementType = typeConverter->convertType(elementType); + return LLVM::LLVMPointerType::get(structElementType, + type.getMemorySpaceAsInt()); +} + +void ConvertToLLVMPattern::getMemRefDescriptorSizes( + Location loc, MemRefType memRefType, ValueRange dynamicSizes, + ConversionPatternRewriter &rewriter, SmallVectorImpl &sizes, + SmallVectorImpl &strides, Value &sizeBytes) const { + assert(isConvertibleAndHasIdentityMaps(memRefType) && + "layout maps must have been normalized away"); + assert(count(memRefType.getShape(), ShapedType::kDynamicSize) == + static_cast(dynamicSizes.size()) && + "dynamicSizes size doesn't match dynamic sizes count in memref shape"); + + sizes.reserve(memRefType.getRank()); + unsigned dynamicIndex = 0; + for (int64_t size : memRefType.getShape()) { + sizes.push_back(size == ShapedType::kDynamicSize + ? dynamicSizes[dynamicIndex++] + : createIndexConstant(rewriter, loc, size)); + } + + // Strides: iterate sizes in reverse order and multiply. + int64_t stride = 1; + Value runningStride = createIndexConstant(rewriter, loc, 1); + strides.resize(memRefType.getRank()); + for (auto i = memRefType.getRank(); i-- > 0;) { + strides[i] = runningStride; + + int64_t size = memRefType.getShape()[i]; + if (size == 0) + continue; + bool useSizeAsStride = stride == 1; + if (size == ShapedType::kDynamicSize) + stride = ShapedType::kDynamicSize; + if (stride != ShapedType::kDynamicSize) + stride *= size; + + if (useSizeAsStride) + runningStride = sizes[i]; + else if (stride == ShapedType::kDynamicSize) + runningStride = + rewriter.create(loc, runningStride, sizes[i]); + else + runningStride = createIndexConstant(rewriter, loc, stride); + } + + // Buffer size in bytes. + Type elementPtrType = getElementPtrType(memRefType); + Value nullPtr = rewriter.create(loc, elementPtrType); + Value gepPtr = rewriter.create( + loc, elementPtrType, ArrayRef{nullPtr, runningStride}); + sizeBytes = rewriter.create(loc, getIndexType(), gepPtr); +} + +Value ConvertToLLVMPattern::getSizeInBytes( + Location loc, Type type, ConversionPatternRewriter &rewriter) const { + // Compute the size of an individual element. This emits the MLIR equivalent + // of the following sizeof(...) implementation in LLVM IR: + // %0 = getelementptr %elementType* null, %indexType 1 + // %1 = ptrtoint %elementType* %0 to %indexType + // which is a common pattern of getting the size of a type in bytes. + auto convertedPtrType = + LLVM::LLVMPointerType::get(typeConverter->convertType(type)); + auto nullPtr = rewriter.create(loc, convertedPtrType); + auto gep = rewriter.create( + loc, convertedPtrType, + ArrayRef{nullPtr, createIndexConstant(rewriter, loc, 1)}); + return rewriter.create(loc, getIndexType(), gep); +} + +Value ConvertToLLVMPattern::getNumElements( + Location loc, ArrayRef shape, + ConversionPatternRewriter &rewriter) const { + // Compute the total number of memref elements. + Value numElements = + shape.empty() ? 
createIndexConstant(rewriter, loc, 1) : shape.front(); + for (unsigned i = 1, e = shape.size(); i < e; ++i) + numElements = rewriter.create(loc, numElements, shape[i]); + return numElements; +} + +/// Creates and populates the memref descriptor struct given all its fields. +MemRefDescriptor ConvertToLLVMPattern::createMemRefDescriptor( + Location loc, MemRefType memRefType, Value allocatedPtr, Value alignedPtr, + ArrayRef sizes, ArrayRef strides, + ConversionPatternRewriter &rewriter) const { + auto structType = typeConverter->convertType(memRefType); + auto memRefDescriptor = MemRefDescriptor::undef(rewriter, loc, structType); + + // Field 1: Allocated pointer, used for malloc/free. + memRefDescriptor.setAllocatedPtr(rewriter, loc, allocatedPtr); + + // Field 2: Actual aligned pointer to payload. + memRefDescriptor.setAlignedPtr(rewriter, loc, alignedPtr); + + // Field 3: Offset in aligned pointer. + memRefDescriptor.setOffset(rewriter, loc, + createIndexConstant(rewriter, loc, 0)); + + // Fields 4: Sizes. + for (auto en : llvm::enumerate(sizes)) + memRefDescriptor.setSize(rewriter, loc, en.index(), en.value()); + + // Field 5: Strides. + for (auto en : llvm::enumerate(strides)) + memRefDescriptor.setStride(rewriter, loc, en.index(), en.value()); + + return memRefDescriptor; +} + +//===----------------------------------------------------------------------===// +// Detail methods +//===----------------------------------------------------------------------===// + +/// Replaces the given operation "op" with a new operation of type "targetOp" +/// and given operands. +LogicalResult LLVM::detail::oneToOneRewrite( + Operation *op, StringRef targetOp, ValueRange operands, + LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) { + unsigned numResults = op->getNumResults(); + + Type packedType; + if (numResults != 0) { + packedType = typeConverter.packFunctionResults(op->getResultTypes()); + if (!packedType) + return failure(); + } + + // Create the operation through state since we don't know its C++ type. + OperationState state(op->getLoc(), targetOp); + state.addTypes(packedType); + state.addOperands(operands); + state.addAttributes(op->getAttrs()); + Operation *newOp = rewriter.createOperation(state); + + // If the operation produced 0 or 1 result, return them immediately. + if (numResults == 0) + return rewriter.eraseOp(op), success(); + if (numResults == 1) + return rewriter.replaceOp(op, newOp->getResult(0)), success(); + + // Otherwise, it had been converted to an operation producing a structure. + // Extract individual results from the structure and return them as list. + SmallVector results; + results.reserve(numResults); + for (unsigned i = 0; i < numResults; ++i) { + auto type = typeConverter.convertType(op->getResult(i).getType()); + results.push_back(rewriter.create( + op->getLoc(), type, newOp->getResult(0), rewriter.getI64ArrayAttr(i))); + } + rewriter.replaceOp(op, results); + return success(); +} diff --git a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp new file mode 100644 index 0000000000000..ace5bec09f4e7 --- /dev/null +++ b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp @@ -0,0 +1,142 @@ +//===- VectorPattern.cpp - Vector conversion pattern to the LLVM dialect --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/LLVMCommon/VectorPattern.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" + +using namespace mlir; + +// For >1-D vector types, extracts the necessary information to iterate over all +// 1-D subvectors in the underlying llrepresentation of the n-D vector +// Iterates on the llvm array type until we hit a non-array type (which is +// asserted to be an llvm vector type). +LLVM::detail::NDVectorTypeInfo +LLVM::detail::extractNDVectorTypeInfo(VectorType vectorType, + LLVMTypeConverter &converter) { + assert(vectorType.getRank() > 1 && "expected >1D vector type"); + NDVectorTypeInfo info; + info.llvmNDVectorTy = converter.convertType(vectorType); + if (!info.llvmNDVectorTy || !LLVM::isCompatibleType(info.llvmNDVectorTy)) { + info.llvmNDVectorTy = nullptr; + return info; + } + info.arraySizes.reserve(vectorType.getRank() - 1); + auto llvmTy = info.llvmNDVectorTy; + while (llvmTy.isa()) { + info.arraySizes.push_back( + llvmTy.cast().getNumElements()); + llvmTy = llvmTy.cast().getElementType(); + } + if (!LLVM::isCompatibleVectorType(llvmTy)) + return info; + info.llvm1DVectorTy = llvmTy; + return info; +} + +// Express `linearIndex` in terms of coordinates of `basis`. +// Returns the empty vector when linearIndex is out of the range [0, P] where +// P is the product of all the basis coordinates. +// +// Prerequisites: +// Basis is an array of nonnegative integers (signed type inherited from +// vector shape type). +SmallVector LLVM::detail::getCoordinates(ArrayRef basis, + unsigned linearIndex) { + SmallVector res; + res.reserve(basis.size()); + for (unsigned basisElement : llvm::reverse(basis)) { + res.push_back(linearIndex % basisElement); + linearIndex = linearIndex / basisElement; + } + if (linearIndex > 0) + return {}; + std::reverse(res.begin(), res.end()); + return res; +} + +// Iterate of linear index, convert to coords space and insert splatted 1-D +// vector in each position. +void LLVM::detail::nDVectorIterate(const LLVM::detail::NDVectorTypeInfo &info, + OpBuilder &builder, + function_ref fun) { + unsigned ub = 1; + for (auto s : info.arraySizes) + ub *= s; + for (unsigned linearIndex = 0; linearIndex < ub; ++linearIndex) { + auto coords = getCoordinates(info.arraySizes, linearIndex); + // Linear index is out of bounds, we are done. 
+ if (coords.empty()) + break; + assert(coords.size() == info.arraySizes.size()); + auto position = builder.getI64ArrayAttr(coords); + fun(position); + } +} + +LogicalResult LLVM::detail::handleMultidimensionalVectors( + Operation *op, ValueRange operands, LLVMTypeConverter &typeConverter, + std::function createOperand, + ConversionPatternRewriter &rewriter) { + auto resultNDVectorType = op->getResult(0).getType().cast(); + + SmallVector operand1DVectorTypes; + for (Value operand : op->getOperands()) { + auto operandNDVectorType = operand.getType().cast(); + auto operandTypeInfo = + extractNDVectorTypeInfo(operandNDVectorType, typeConverter); + operand1DVectorTypes.push_back(operandTypeInfo.llvm1DVectorTy); + } + auto resultTypeInfo = + extractNDVectorTypeInfo(resultNDVectorType, typeConverter); + auto result1DVectorTy = resultTypeInfo.llvm1DVectorTy; + auto resultNDVectoryTy = resultTypeInfo.llvmNDVectorTy; + auto loc = op->getLoc(); + Value desc = rewriter.create(loc, resultNDVectoryTy); + nDVectorIterate(resultTypeInfo, rewriter, [&](ArrayAttr position) { + // For this unrolled `position` corresponding to the `linearIndex`^th + // element, extract operand vectors + SmallVector extractedOperands; + for (auto operand : llvm::enumerate(operands)) { + extractedOperands.push_back(rewriter.create( + loc, operand1DVectorTypes[operand.index()], operand.value(), + position)); + } + Value newVal = createOperand(result1DVectorTy, extractedOperands); + desc = rewriter.create(loc, resultNDVectoryTy, desc, + newVal, position); + }); + rewriter.replaceOp(op, desc); + return success(); +} + +LogicalResult LLVM::detail::vectorOneToOneRewrite( + Operation *op, StringRef targetOp, ValueRange operands, + LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) { + assert(!operands.empty()); + + // Cannot convert ops if their operands are not of LLVM type. 
+ if (!llvm::all_of(operands.getTypes(), + [](Type t) { return isCompatibleType(t); })) + return failure(); + + auto llvmNDVectorTy = operands[0].getType(); + if (!llvmNDVectorTy.isa()) + return oneToOneRewrite(op, targetOp, operands, typeConverter, rewriter); + + auto callback = [op, targetOp, &rewriter](Type llvm1DVectorTy, + ValueRange operands) { + OperationState state(op->getLoc(), targetOp); + state.addTypes(llvm1DVectorTy); + state.addOperands(operands); + state.addAttributes(op->getAttrs()); + return rewriter.createOperation(state)->getResult(0); + }; + + return handleMultidimensionalVectors(op, operands, typeConverter, callback, + rewriter); +} diff --git a/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt b/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt index c9cf7883a0abf..e0774746960f9 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_conversion_library(MLIROpenMPToLLVM LINK_LIBS PUBLIC MLIRIR + MLIRLLVMCommonConversion MLIRLLVMIR MLIROpenMP MLIRStandardToLLVM diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index 878e11ae6c5aa..3a9eff6ead6d3 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -9,7 +9,9 @@ #include "mlir/Conversion/OpenMPToLLVM/ConvertOpenMPToLLVM.h" #include "../PassDetail.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp index f094f6443b156..4ec3c70568765 100644 --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -13,6 +13,8 @@ #include "../PassDetail.h" #include "mlir/Analysis/DataLayoutAnalysis.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Conversion/LLVMCommon/VectorPattern.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" @@ -46,214 +48,6 @@ using namespace mlir; #define PASS_NAME "convert-std-to-llvm" -ConvertToLLVMPattern::ConvertToLLVMPattern(StringRef rootOpName, - MLIRContext *context, - LLVMTypeConverter &typeConverter, - PatternBenefit benefit) - : ConversionPattern(typeConverter, rootOpName, benefit, context) {} - - -LLVMTypeConverter *ConvertToLLVMPattern::getTypeConverter() const { - return static_cast( - ConversionPattern::getTypeConverter()); -} - -LLVM::LLVMDialect &ConvertToLLVMPattern::getDialect() const { - return *getTypeConverter()->getDialect(); -} - -Type ConvertToLLVMPattern::getIndexType() const { - return getTypeConverter()->getIndexType(); -} - -Type ConvertToLLVMPattern::getIntPtrType(unsigned addressSpace) const { - return IntegerType::get(&getTypeConverter()->getContext(), - getTypeConverter()->getPointerBitwidth(addressSpace)); -} - -Type ConvertToLLVMPattern::getVoidType() const { - return LLVM::LLVMVoidType::get(&getTypeConverter()->getContext()); -} - -Type ConvertToLLVMPattern::getVoidPtrType() const { - return LLVM::LLVMPointerType::get( - IntegerType::get(&getTypeConverter()->getContext(), 8)); -} - -// Creates a constant 
Op producing a value of `resultType` from an index-typed -// integer attribute. -static Value createIndexAttrConstant(OpBuilder &builder, Location loc, - Type resultType, int64_t value) { - return builder.create( - loc, resultType, builder.getIntegerAttr(builder.getIndexType(), value)); -} - -Value ConvertToLLVMPattern::createIndexConstant( - ConversionPatternRewriter &builder, Location loc, uint64_t value) const { - return createIndexAttrConstant(builder, loc, getIndexType(), value); -} - -Value ConvertToLLVMPattern::getStridedElementPtr( - Location loc, MemRefType type, Value memRefDesc, ValueRange indices, - ConversionPatternRewriter &rewriter) const { - - int64_t offset; - SmallVector strides; - auto successStrides = getStridesAndOffset(type, strides, offset); - assert(succeeded(successStrides) && "unexpected non-strided memref"); - (void)successStrides; - - MemRefDescriptor memRefDescriptor(memRefDesc); - Value base = memRefDescriptor.alignedPtr(rewriter, loc); - - Value index; - if (offset != 0) // Skip if offset is zero. - index = MemRefType::isDynamicStrideOrOffset(offset) - ? memRefDescriptor.offset(rewriter, loc) - : createIndexConstant(rewriter, loc, offset); - - for (int i = 0, e = indices.size(); i < e; ++i) { - Value increment = indices[i]; - if (strides[i] != 1) { // Skip if stride is 1. - Value stride = MemRefType::isDynamicStrideOrOffset(strides[i]) - ? memRefDescriptor.stride(rewriter, loc, i) - : createIndexConstant(rewriter, loc, strides[i]); - increment = rewriter.create(loc, increment, stride); - } - index = - index ? rewriter.create(loc, index, increment) : increment; - } - - Type elementPtrType = memRefDescriptor.getElementPtrType(); - return index ? rewriter.create(loc, elementPtrType, base, index) - : base; -} - -// Check if the MemRefType `type` is supported by the lowering. We currently -// only support memrefs with identity maps. -bool ConvertToLLVMPattern::isConvertibleAndHasIdentityMaps( - MemRefType type) const { - if (!typeConverter->convertType(type.getElementType())) - return false; - return type.getAffineMaps().empty() || - llvm::all_of(type.getAffineMaps(), - [](AffineMap map) { return map.isIdentity(); }); -} - -Type ConvertToLLVMPattern::getElementPtrType(MemRefType type) const { - auto elementType = type.getElementType(); - auto structElementType = typeConverter->convertType(elementType); - return LLVM::LLVMPointerType::get(structElementType, - type.getMemorySpaceAsInt()); -} - -void ConvertToLLVMPattern::getMemRefDescriptorSizes( - Location loc, MemRefType memRefType, ValueRange dynamicSizes, - ConversionPatternRewriter &rewriter, SmallVectorImpl &sizes, - SmallVectorImpl &strides, Value &sizeBytes) const { - assert(isConvertibleAndHasIdentityMaps(memRefType) && - "layout maps must have been normalized away"); - assert(count(memRefType.getShape(), ShapedType::kDynamicSize) == - static_cast(dynamicSizes.size()) && - "dynamicSizes size doesn't match dynamic sizes count in memref shape"); - - sizes.reserve(memRefType.getRank()); - unsigned dynamicIndex = 0; - for (int64_t size : memRefType.getShape()) { - sizes.push_back(size == ShapedType::kDynamicSize - ? dynamicSizes[dynamicIndex++] - : createIndexConstant(rewriter, loc, size)); - } - - // Strides: iterate sizes in reverse order and multiply. 
- int64_t stride = 1; - Value runningStride = createIndexConstant(rewriter, loc, 1); - strides.resize(memRefType.getRank()); - for (auto i = memRefType.getRank(); i-- > 0;) { - strides[i] = runningStride; - - int64_t size = memRefType.getShape()[i]; - if (size == 0) - continue; - bool useSizeAsStride = stride == 1; - if (size == ShapedType::kDynamicSize) - stride = ShapedType::kDynamicSize; - if (stride != ShapedType::kDynamicSize) - stride *= size; - - if (useSizeAsStride) - runningStride = sizes[i]; - else if (stride == ShapedType::kDynamicSize) - runningStride = - rewriter.create(loc, runningStride, sizes[i]); - else - runningStride = createIndexConstant(rewriter, loc, stride); - } - - // Buffer size in bytes. - Type elementPtrType = getElementPtrType(memRefType); - Value nullPtr = rewriter.create(loc, elementPtrType); - Value gepPtr = rewriter.create( - loc, elementPtrType, ArrayRef{nullPtr, runningStride}); - sizeBytes = rewriter.create(loc, getIndexType(), gepPtr); -} - -Value ConvertToLLVMPattern::getSizeInBytes( - Location loc, Type type, ConversionPatternRewriter &rewriter) const { - // Compute the size of an individual element. This emits the MLIR equivalent - // of the following sizeof(...) implementation in LLVM IR: - // %0 = getelementptr %elementType* null, %indexType 1 - // %1 = ptrtoint %elementType* %0 to %indexType - // which is a common pattern of getting the size of a type in bytes. - auto convertedPtrType = - LLVM::LLVMPointerType::get(typeConverter->convertType(type)); - auto nullPtr = rewriter.create(loc, convertedPtrType); - auto gep = rewriter.create( - loc, convertedPtrType, - ArrayRef{nullPtr, createIndexConstant(rewriter, loc, 1)}); - return rewriter.create(loc, getIndexType(), gep); -} - -Value ConvertToLLVMPattern::getNumElements( - Location loc, ArrayRef shape, - ConversionPatternRewriter &rewriter) const { - // Compute the total number of memref elements. - Value numElements = - shape.empty() ? createIndexConstant(rewriter, loc, 1) : shape.front(); - for (unsigned i = 1, e = shape.size(); i < e; ++i) - numElements = rewriter.create(loc, numElements, shape[i]); - return numElements; -} - -/// Creates and populates the memref descriptor struct given all its fields. -MemRefDescriptor ConvertToLLVMPattern::createMemRefDescriptor( - Location loc, MemRefType memRefType, Value allocatedPtr, Value alignedPtr, - ArrayRef sizes, ArrayRef strides, - ConversionPatternRewriter &rewriter) const { - auto structType = typeConverter->convertType(memRefType); - auto memRefDescriptor = MemRefDescriptor::undef(rewriter, loc, structType); - - // Field 1: Allocated pointer, used for malloc/free. - memRefDescriptor.setAllocatedPtr(rewriter, loc, allocatedPtr); - - // Field 2: Actual aligned pointer to payload. - memRefDescriptor.setAlignedPtr(rewriter, loc, alignedPtr); - - // Field 3: Offset in aligned pointer. - memRefDescriptor.setOffset(rewriter, loc, - createIndexConstant(rewriter, loc, 0)); - - // Fields 4: Sizes. - for (auto en : llvm::enumerate(sizes)) - memRefDescriptor.setSize(rewriter, loc, en.index(), en.value()); - - // Field 5: Strides. - for (auto en : llvm::enumerate(strides)) - memRefDescriptor.setStride(rewriter, loc, en.index(), en.value()); - - return memRefDescriptor; -} - /// Only retain those attributes that are not constructed by /// `LLVMFuncOp::build`. If `filterArgAttrs` is set, also filter out argument /// attributes. 
@@ -572,190 +366,6 @@ struct BarePtrFuncOpConversion : public FuncOpConversionBase { } }; -//////////////// Support for Lowering operations on n-D vectors //////////////// -// Helper struct to "unroll" operations on n-D vectors in terms of operations on -// 1-D LLVM vectors. -struct NDVectorTypeInfo { - // LLVM array struct which encodes n-D vectors. - Type llvmNDVectorTy; - // LLVM vector type which encodes the inner 1-D vector type. - Type llvm1DVectorTy; - // Multiplicity of llvmNDVectorTy to llvm1DVectorTy. - SmallVector arraySizes; -}; -} // namespace - -// For >1-D vector types, extracts the necessary information to iterate over all -// 1-D subvectors in the underlying llrepresentation of the n-D vector -// Iterates on the llvm array type until we hit a non-array type (which is -// asserted to be an llvm vector type). -static NDVectorTypeInfo extractNDVectorTypeInfo(VectorType vectorType, - LLVMTypeConverter &converter) { - assert(vectorType.getRank() > 1 && "expected >1D vector type"); - NDVectorTypeInfo info; - info.llvmNDVectorTy = converter.convertType(vectorType); - if (!info.llvmNDVectorTy || !LLVM::isCompatibleType(info.llvmNDVectorTy)) { - info.llvmNDVectorTy = nullptr; - return info; - } - info.arraySizes.reserve(vectorType.getRank() - 1); - auto llvmTy = info.llvmNDVectorTy; - while (llvmTy.isa()) { - info.arraySizes.push_back( - llvmTy.cast().getNumElements()); - llvmTy = llvmTy.cast().getElementType(); - } - if (!LLVM::isCompatibleVectorType(llvmTy)) - return info; - info.llvm1DVectorTy = llvmTy; - return info; -} - -// Express `linearIndex` in terms of coordinates of `basis`. -// Returns the empty vector when linearIndex is out of the range [0, P] where -// P is the product of all the basis coordinates. -// -// Prerequisites: -// Basis is an array of nonnegative integers (signed type inherited from -// vector shape type). -static SmallVector getCoordinates(ArrayRef basis, - unsigned linearIndex) { - SmallVector res; - res.reserve(basis.size()); - for (unsigned basisElement : llvm::reverse(basis)) { - res.push_back(linearIndex % basisElement); - linearIndex = linearIndex / basisElement; - } - if (linearIndex > 0) - return {}; - std::reverse(res.begin(), res.end()); - return res; -} - -// Iterate of linear index, convert to coords space and insert splatted 1-D -// vector in each position. -template -void nDVectorIterate(const NDVectorTypeInfo &info, OpBuilder &builder, - Lambda fun) { - unsigned ub = 1; - for (auto s : info.arraySizes) - ub *= s; - for (unsigned linearIndex = 0; linearIndex < ub; ++linearIndex) { - auto coords = getCoordinates(info.arraySizes, linearIndex); - // Linear index is out of bounds, we are done. - if (coords.empty()) - break; - assert(coords.size() == info.arraySizes.size()); - auto position = builder.getI64ArrayAttr(coords); - fun(position); - } -} -////////////// End Support for Lowering operations on n-D vectors ////////////// - -/// Replaces the given operation "op" with a new operation of type "targetOp" -/// and given operands. -LogicalResult LLVM::detail::oneToOneRewrite( - Operation *op, StringRef targetOp, ValueRange operands, - LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) { - unsigned numResults = op->getNumResults(); - - Type packedType; - if (numResults != 0) { - packedType = typeConverter.packFunctionResults(op->getResultTypes()); - if (!packedType) - return failure(); - } - - // Create the operation through state since we don't know its C++ type. 
- OperationState state(op->getLoc(), targetOp); - state.addTypes(packedType); - state.addOperands(operands); - state.addAttributes(op->getAttrs()); - Operation *newOp = rewriter.createOperation(state); - - // If the operation produced 0 or 1 result, return them immediately. - if (numResults == 0) - return rewriter.eraseOp(op), success(); - if (numResults == 1) - return rewriter.replaceOp(op, newOp->getResult(0)), success(); - - // Otherwise, it had been converted to an operation producing a structure. - // Extract individual results from the structure and return them as list. - SmallVector results; - results.reserve(numResults); - for (unsigned i = 0; i < numResults; ++i) { - auto type = typeConverter.convertType(op->getResult(i).getType()); - results.push_back(rewriter.create( - op->getLoc(), type, newOp->getResult(0), rewriter.getI64ArrayAttr(i))); - } - rewriter.replaceOp(op, results); - return success(); -} - -static LogicalResult handleMultidimensionalVectors( - Operation *op, ValueRange operands, LLVMTypeConverter &typeConverter, - std::function createOperand, - ConversionPatternRewriter &rewriter) { - auto resultNDVectorType = op->getResult(0).getType().cast(); - - SmallVector operand1DVectorTypes; - for (Value operand : op->getOperands()) { - auto operandNDVectorType = operand.getType().cast(); - auto operandTypeInfo = - extractNDVectorTypeInfo(operandNDVectorType, typeConverter); - operand1DVectorTypes.push_back(operandTypeInfo.llvm1DVectorTy); - } - auto resultTypeInfo = - extractNDVectorTypeInfo(resultNDVectorType, typeConverter); - auto result1DVectorTy = resultTypeInfo.llvm1DVectorTy; - auto resultNDVectoryTy = resultTypeInfo.llvmNDVectorTy; - auto loc = op->getLoc(); - Value desc = rewriter.create(loc, resultNDVectoryTy); - nDVectorIterate(resultTypeInfo, rewriter, [&](ArrayAttr position) { - // For this unrolled `position` corresponding to the `linearIndex`^th - // element, extract operand vectors - SmallVector extractedOperands; - for (auto operand : llvm::enumerate(operands)) { - extractedOperands.push_back(rewriter.create( - loc, operand1DVectorTypes[operand.index()], operand.value(), - position)); - } - Value newVal = createOperand(result1DVectorTy, extractedOperands); - desc = rewriter.create(loc, resultNDVectoryTy, desc, - newVal, position); - }); - rewriter.replaceOp(op, desc); - return success(); -} - -LogicalResult LLVM::detail::vectorOneToOneRewrite( - Operation *op, StringRef targetOp, ValueRange operands, - LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) { - assert(!operands.empty()); - - // Cannot convert ops if their operands are not of LLVM type. - if (!llvm::all_of(operands.getTypes(), - [](Type t) { return isCompatibleType(t); })) - return failure(); - - auto llvmNDVectorTy = operands[0].getType(); - if (!llvmNDVectorTy.isa()) - return oneToOneRewrite(op, targetOp, operands, typeConverter, rewriter); - - auto callback = [op, targetOp, &rewriter](Type llvm1DVectorTy, - ValueRange operands) { - OperationState state(op->getLoc(), targetOp); - state.addTypes(llvm1DVectorTy); - state.addOperands(operands); - state.addAttributes(op->getAttrs()); - return rewriter.createOperation(state)->getResult(0); - }; - - return handleMultidimensionalVectors(op, operands, typeConverter, callback, - rewriter); -} - -namespace { // Straightforward lowerings. 
using AbsFOpLowering = VectorConvertToLLVMPattern; using AddFOpLowering = VectorConvertToLLVMPattern; @@ -1427,7 +1037,7 @@ struct ExpM1OpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(op, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { auto splatAttr = SplatElementsAttr::get( @@ -1482,7 +1092,7 @@ struct Log1pOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(op, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { auto splatAttr = SplatElementsAttr::get( @@ -1536,7 +1146,7 @@ struct RsqrtOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return failure(); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { auto splatAttr = SplatElementsAttr::get( @@ -2244,7 +1854,7 @@ struct CmpIOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(cmpiOp, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( cmpiOp.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { CmpIOpAdaptor transformed(operands); @@ -2282,7 +1892,7 @@ struct CmpFOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(cmpfOp, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( cmpfOp.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { CmpFOpAdaptor transformed(operands); @@ -2445,7 +2055,7 @@ struct SplatNdOpLowering : public ConvertOpToLLVMPattern { // First insert it into an undef vector so we can shuffle it. auto loc = splatOp.getLoc(); auto vectorTypeInfo = - extractNDVectorTypeInfo(resultType, *getTypeConverter()); + LLVM::detail::extractNDVectorTypeInfo(resultType, *getTypeConverter()); auto llvmNDVectorTy = vectorTypeInfo.llvmNDVectorTy; auto llvm1DVectorTy = vectorTypeInfo.llvm1DVectorTy; if (!llvmNDVectorTy || !llvm1DVectorTy) From 6c0fd4db79f2def432f761627bb8c7d4171a3237 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Wed, 7 Jul 2021 16:11:10 +0000 Subject: [PATCH 25/32] [mlir][MemRef] Fix DimOp folding of OffsetSizeAndStrideInterface. 
This addresses the issue reported in https://llvm.discourse.group/t/rank-reducing-memref-subview-offsetsizeandstrideopinterface-interface-issues/3805 Differential Revision: https://reviews.llvm.org/D105558 --- mlir/include/mlir/IR/BuiltinTypes.h | 4 +++ mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 12 +++---- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 16 ++++++--- mlir/lib/IR/BuiltinTypes.cpp | 9 +++++ mlir/test/Dialect/MemRef/canonicalize.mlir | 23 ++++++++++++- mlir/test/Dialect/Tensor/canonicalize.mlir | 39 ++++++++++++++++++++++ 6 files changed, 92 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h index 8b30fa94f9936..44e751ab0edeb 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.h +++ b/mlir/include/mlir/IR/BuiltinTypes.h @@ -110,6 +110,10 @@ class ShapedType : public Type { /// size. Otherwise, abort. int64_t getNumDynamicDims() const; + /// If `dim` is a dynamic dim, return its relative index among the dynamic + /// dims. Otherwise, abort. The result is guaranteed to be nonnegative. + int64_t getRelativeIndexOfDynamicDim(unsigned dim) const; + /// If this is ranked type, return the size of the specified dimension. /// Otherwise, abort. int64_t getDimSize(unsigned idx) const; diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 518539376c9f4..a4cbb23bf74dc 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -175,9 +175,9 @@ struct SimplifyDeadAlloc : public OpRewritePattern { LogicalResult matchAndRewrite(T alloc, PatternRewriter &rewriter) const override { if (llvm::any_of(alloc->getUsers(), [&](Operation *op) { - if (auto storeOp = dyn_cast(op)) - return storeOp.value() == alloc; - return !isa(op); + if (auto storeOp = dyn_cast(op)) + return storeOp.value() == alloc; + return !isa(op); })) return failure(); @@ -677,9 +677,9 @@ OpFoldResult DimOp::fold(ArrayRef operands) { if (auto sizeInterface = dyn_cast_or_null(definingOp)) { - assert(sizeInterface.isDynamicSize(unsignedIndex) && - "Expected dynamic subview size"); - return sizeInterface.getDynamicSize(unsignedIndex); + int64_t nthDynamicIndex = + memrefType.getRelativeIndexOfDynamicDim(unsignedIndex); + return sizeInterface.sizes()[nthDynamicIndex]; } // dim(memrefcast) -> dim diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index dbd47c2d1fcd0..b794c11d5948f 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -271,13 +271,21 @@ OpFoldResult DimOp::fold(ArrayRef operands) { return Value{*dynExtents}; } + // dim(insert_slice.result()) -> dim(insert_slice.dest()) + if (auto insertSliceOp = + dyn_cast_or_null(definingOp)) { + this->sourceMutable().assign(insertSliceOp.dest()); + return getResult(); + } + // The size at the given index is now known to be a dynamic size. 
unsigned unsignedIndex = index.getValue().getZExtValue(); - if (auto sliceOp = dyn_cast_or_null(definingOp)) { - assert(sliceOp.isDynamicSize(unsignedIndex) && - "Expected dynamic slice size"); - return sliceOp.getDynamicSize(unsignedIndex); + if (auto sizeInterface = + dyn_cast_or_null(definingOp)) { + int64_t nthDynamicIndex = + tensorType.getRelativeIndexOfDynamicDim(unsignedIndex); + return sizeInterface.sizes()[nthDynamicIndex]; } // dim(cast) -> dim diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp index f350596384a90..0c715d2d528f5 100644 --- a/mlir/lib/IR/BuiltinTypes.cpp +++ b/mlir/lib/IR/BuiltinTypes.cpp @@ -427,6 +427,15 @@ int64_t ShapedType::getNumDynamicDims() const { return llvm::count_if(getShape(), isDynamic); } +int64_t ShapedType::getRelativeIndexOfDynamicDim(unsigned dim) const { + assert(isDynamicDim(dim) && "expected a dynamic dim"); + int nthDynamicIndex = -1; + for (unsigned idx = 0; idx <= dim; ++idx) + if (isDynamicDim(idx)) + ++nthDynamicIndex; + return nthDynamicIndex; +} + bool ShapedType::hasStaticShape() const { return hasRank() && llvm::none_of(getShape(), isDynamic); } diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir index 2ae2c06dea92e..302477f04421e 100644 --- a/mlir/test/Dialect/MemRef/canonicalize.mlir +++ b/mlir/test/Dialect/MemRef/canonicalize.mlir @@ -387,11 +387,32 @@ func @alloc_const_fold_with_symbols2() -> memref { } // ----- + // CHECK-LABEL: func @allocator // CHECK: %[[alloc:.+]] = memref.alloc // CHECK: memref.store %[[alloc:.+]], %arg0 func @allocator(%arg0 : memref>, %arg1 : index) { %0 = memref.alloc(%arg1) : memref memref.store %0, %arg0[] : memref> - return + return +} + +// ----- + +#map0 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> + +// CHECK-LABEL: func @rank_reducing_subview_dim +// CHECK-SAME: %[[IDX_0:[0-9a-zA-Z]*]]: index +// CHECK-SAME: %[[IDX_1:[0-9a-zA-Z]*]]: index +func @rank_reducing_subview_dim(%arg0 : memref, %arg1 : index, + %arg2 : index) -> index +{ + %c0 = constant 0 : index + %c1 = constant 1 : index + %c4 = constant 4 : index + %0 = memref.subview %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : memref to memref + %1 = memref.dim %0, %c1 : memref + + // CHECK-NEXT: return %[[IDX_1]] : index + return %1 : index } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index f0259952da380..977357077df37 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -517,3 +517,42 @@ func @fold_dim_of_tensor.cast(%arg0 : tensor<4x?xf32>) -> (index, index) { %2 = tensor.dim %0, %c1 : tensor return %1, %2: index, index } + +// ----- + +// CHECK-LABEL: func @rank_reducing_extract_slice_dim +// CHECK-SAME: %[[IDX_0:[0-9a-zA-Z]*]]: index +// CHECK-SAME: %[[IDX_1:[0-9a-zA-Z]*]]: index +func @rank_reducing_extract_slice_dim(%arg0 : tensor, %arg1 : index, + %arg2 : index) -> index +{ + %c0 = constant 0 : index + %c1 = constant 1 : index + %c4 = constant 4 : index + %0 = tensor.extract_slice %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor to tensor + %1 = tensor.dim %0, %c1 : tensor + + // CHECK-NEXT: return %[[IDX_1]] : index + return %1 : index +} + +// ----- + +// CHECK-LABEL: func @rank_reducing_insert_slice_dim +// CHECK-SAME: %[[OUT:[0-9a-zA-Z]*]]: tensor +func @rank_reducing_insert_slice_dim(%out : tensor, %in : tensor, %arg1 : index, + %arg2 : index) -> index +{ + // CHECK-NEXT: %[[C1:.*]] = constant 1 : index + + %c0 
= constant 0 : index + %c1 = constant 1 : index + %c4 = constant 4 : index + %0 = tensor.insert_slice %in into %out[%c0, %arg1, %c1] [1, 1, 1] [%c1, %c1, %c1] : tensor into tensor + + // CHECK-NEXT: %[[D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor + %1 = tensor.dim %0, %c1 : tensor + + // CHECK-NEXT: return %[[D1]] : index + return %1 : index +} From 84354b2ab20924b3807c0464308852e4568b63a3 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Thu, 8 Jul 2021 07:02:15 +0000 Subject: [PATCH 26/32] [mlir][linalg] Remove GenericOpBase. Remove the GenericOpBase class formerly used to factor out common logic shared be GenericOp and IndexedGenericOp. After removing IndexedGenericOp, the base class is not used anymore. Differential Revision: https://reviews.llvm.org/D105307 --- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 18f5beeddf2ea..fa17237216596 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -503,41 +503,10 @@ def PoolingSumOp: SingleInputPoolingBase_Op<"pooling_sum"> { // Generic Linalg ops. //===----------------------------------------------------------------------===// -class GenericOpBase : LinalgStructuredBase_Op, SingleBlockImplicitTerminator<"YieldOp">]> { - let arguments = (ins Variadic:$inputs, - Variadic:$outputs, - AffineMapArrayAttr:$indexing_maps, - ArrayAttr:$iterator_types, - OptionalAttr:$doc, - OptionalAttr:$library_call); - let results = (outs Variadic:$result_tensors); - let regions = (region AnyRegion:$region); - let extraClassDeclaration = structuredOpsBaseDecls # [{ - SmallVector linalgTraitAttrNames() { - return SmallVector{ - getDocAttrName(), - getIndexingMapsAttrName(), getLibraryCallAttrName(), - getIteratorTypesAttrName(), - }; - } - std::string getLibraryCallName() { - return library_call().hasValue() ? - library_call()->str() : "op_has_no_registered_library_name"; - } - - static std::function - getRegionBuilder() { - return nullptr; - } - }]; - let printer = [{ return ::print(p, *this); }]; - let parser = [{ return ::parseGenericOp(parser, result); }]; -} - -def GenericOp : GenericOpBase<"generic"> { let description = [{ Generic Linalg op form where the key properties of the computation are specified as attributes. In pretty form, a `linalg.generic` op is written @@ -636,6 +605,15 @@ def GenericOp : GenericOpBase<"generic"> { ``` }]; + let arguments = (ins Variadic:$inputs, + Variadic:$outputs, + AffineMapArrayAttr:$indexing_maps, + ArrayAttr:$iterator_types, + OptionalAttr:$doc, + OptionalAttr:$library_call); + let results = (outs Variadic:$result_tensors); + let regions = (region AnyRegion:$region); + let builders = [ OpBuilder<(ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, "ValueRange":$outputs, "ArrayRef":$indexingMaps, @@ -654,6 +632,29 @@ def GenericOp : GenericOpBase<"generic"> { "ArrayRef":$indexingMaps, "ArrayRef":$iteratorTypes, CArg<"function_ref", "nullptr">)> ]; + + let extraClassDeclaration = structuredOpsBaseDecls # [{ + SmallVector linalgTraitAttrNames() { + return SmallVector{ + getDocAttrName(), + getIndexingMapsAttrName(), getLibraryCallAttrName(), + getIteratorTypesAttrName(), + }; + } + std::string getLibraryCallName() { + return library_call().hasValue() ? 
+ library_call()->str() : "op_has_no_registered_library_name"; + } + + static std::function + getRegionBuilder() { + return nullptr; + } + }]; + + let printer = [{ return ::print(p, *this); }]; + let parser = [{ return ::parseGenericOp(parser, result); }]; + let verifier = [{ return ::verify(*this); }]; let hasFolder = 1; From abfa950d86da1737a7dd52ba262fa39dd2e937fa Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Thu, 8 Jul 2021 08:48:23 +0000 Subject: [PATCH 27/32] [mlir][linalg][python] Add exp and log to the OpDSL. Introduce the exp and log function in OpDSL. Add the soft plus operator to test the emitted IR in Python and C++. Reviewed By: nicolasvasilache Differential Revision: https://reviews.llvm.org/D105420 --- .../mlir/Dialect/Linalg/IR/LinalgBase.td | 4 +- .../Linalg/IR/LinalgNamedStructuredOps.yaml | 55 +++++++++++++++++++ .../mlir/Dialect/Linalg/IR/LinalgTypes.h | 1 + mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 1 + mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 14 +++++ .../dialects/linalg/opdsl/lang/emitter.py | 11 ++++ .../linalg/opdsl/ops/core_named_ops.py | 13 +++++ .../generalize-named-polymorphic-ops.mlir | 17 ++++++ .../linalg/opdsl/emit_structured_generic.py | 21 +++++++ 9 files changed, 135 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td index 092d22983d3f2..49ececc0790aa 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td @@ -33,8 +33,8 @@ def Linalg_Dialect : Dialect { }]; let cppNamespace = "::mlir::linalg"; let dependentDialects = [ - "AffineDialect", "memref::MemRefDialect", "StandardOpsDialect", - "tensor::TensorDialect" + "AffineDialect", "math::MathDialect", "memref::MemRefDialect", + "StandardOpsDialect", "tensor::TensorDialect" ]; let hasCanonicalizer = 1; let hasOperationAttrVerify = 1; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 1e4277ecd7bdf..04f9776005c4e 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -887,3 +887,58 @@ structured_op: !LinalgStructuredOpConfig scalar_const: '2.3283063999999999E-10 : f64' - !ScalarExpression scalar_arg: min +--- !LinalgOpConfig +metadata: !LinalgOpMetadata + name: soft_plus_2d + cpp_class_name: SoftPlus2DOp + doc: |- + Implements the soft plus operator. + + Numeric casting is performed on the input operand, promoting it to the same + data type as the accumulator/output. 
+structured_op: !LinalgStructuredOpConfig + args: + - !LinalgOperandDefConfig + name: I + usage: InputOperand + type_var: T + shape_map: affine_map<()[s0, s1] -> (s0, s1)> + - !LinalgOperandDefConfig + name: O + usage: OutputOperand + type_var: U + shape_map: affine_map<()[s0, s1] -> (s0, s1)> + indexing_maps: !LinalgIndexingMapsConfig + static_indexing_maps: + - affine_map<(d0, d1)[s0, s1] -> (d0, d1)> + - affine_map<(d0, d1)[s0, s1] -> (d0, d1)> + iterator_types: + - parallel + - parallel + assignments: + - !ScalarAssign + arg: O + value: !ScalarExpression + scalar_apply: + fn_name: log + operands: + - !ScalarExpression + scalar_apply: + fn_name: add + operands: + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_const: '1.000000e+00 : f64' + - !ScalarExpression + scalar_apply: + fn_name: exp + operands: + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: I diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h index c5cfdd15c00a8..f5913e6ad6164 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h @@ -10,6 +10,7 @@ #define MLIR_DIALECT_LINALG_LINALGTYPES_H_ #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index 21104281b8120..14187f400e726 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -20,6 +20,7 @@ add_mlir_dialect_library(MLIRLinalg MLIRSideEffectInterfaces MLIRViewLikeInterface MLIRStandard + MLIRMath MLIRMemRef MLIRTensor ) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 93062b10ccc63..ea12a312d9c01 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -256,6 +256,20 @@ class RegionBuilderHelper { llvm_unreachable("unsupported non numeric type"); } + Value applyfn__exp(Value x) { + OpBuilder builder = getBuilder(); + if (isFloatingPoint(x)) + return builder.create(x.getLoc(), x); + llvm_unreachable("unsupported non numeric type"); + } + + Value applyfn__log(Value x) { + OpBuilder builder = getBuilder(); + if (isFloatingPoint(x)) + return builder.create(x.getLoc(), x); + llvm_unreachable("unsupported non numeric type"); + } + Value applyfn__sub(Value lhs, Value rhs) { OpBuilder builder = getBuilder(); if (isFloatingPoint(lhs)) diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py index 61d2260587116..3810df9dff74a 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py @@ -7,6 +7,7 @@ from mlir.ir import * from mlir.dialects import linalg from mlir.dialects import std +from mlir.dialects import math # TODO: resolve name collision for Linalg functionality that is injected inside # the _mlir.dialects.linalg directly via pybind. 
from _mlir.dialects.linalg import fill_builtin_region @@ -293,6 +294,16 @@ def _eval_add(self, lhs: Value, rhs: Value) -> Value: return std.AddIOp(lhs.type, lhs, rhs).result raise NotImplementedError("Unsupported 'add' operand: {lhs}") + def _eval_exp(self, x: Value) -> Value: + if _is_floating_point_type(x.type): + return math.ExpOp(x.type, x).result + raise NotImplementedError("Unsupported 'exp' operand: {x}") + + def _eval_log(self, x: Value) -> Value: + if _is_floating_point_type(x.type): + return math.LogOp(x.type, x).result + raise NotImplementedError("Unsupported 'log' operand: {x}") + def _eval_sub(self, lhs: Value, rhs: Value) -> Value: if _is_floating_point_type(lhs.type): return std.SubFOp(lhs.type, lhs, rhs).result diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index a37e1944c1f75..72793cbf9c726 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -209,3 +209,16 @@ def fill_rng_2d( offset = cast(F64, const(2147483647)) scaling = (max - min) * inv_range O[D.m, D.n] = cast(T, (offset + cast(F64, rand2)) * scaling + min) + + +@linalg_structured_op +def soft_plus_2d( + I=TensorDef(T, S.M, S.N), O=TensorDef(U, S.M, S.N, output=True)): + """Implements the soft plus operator. + + Numeric casting is performed on the input operand, promoting it to the same + data type as the accumulator/output. + """ + domain(D.m, D.n) + O[D.m, D.n] = \ + PrimFn.log(cast(U, const(1.0)) + PrimFn.exp(cast(U, I[D.m, D.n]))) diff --git a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir index 0e1c6a62a7b10..aed3585d4f547 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir @@ -188,6 +188,23 @@ func @generalize_fill_rng_2d_i32(%min: f64, %max: f64, %seed: i32, %O: tensor<16 // CHECK-NEXT: linalg.yield %[[VAL6]] : i32 // CHECK-NEXT: -> tensor<16x32xi32> +// ----- + +func @generalize_soft_plus_2d_f32(%input: tensor<16x32xf32>, %output: tensor<16x32xf32>) -> tensor<16x32xf32> { + %0 = linalg.soft_plus_2d ins(%input: tensor<16x32xf32>) outs(%output: tensor<16x32xf32>) -> tensor<16x32xf32> + return %0: tensor<16x32xf32> +} + +// CHECK-LABEL: @generalize_soft_plus_2d_f32 +// CHECK: %[[C1:.+]] = constant 1.000000e+00 : f64 +// CHECK: ^{{.*}}(%[[IN:.+]]: f32, %[[OUT:.+]]: f32 +// CHECK-NEXT: %[[C1_CAST:.+]] = fptrunc %[[C1]] : f64 to f32 +// CHECK-NEXT: %[[EXP:.+]] = math.exp %[[IN]] : f32 +// CHECK-NEXT: %[[SUM:.+]] = addf %[[C1_CAST]], %[[EXP]] : f32 +// CHECK-NEXT: %[[LOG:.+]] = math.log %[[SUM]] : f32 +// CHECK-NEXT: linalg.yield %[[LOG]] : f32 +// CHECK-NEXT: -> tensor<16x32xf32> + // ----- // Verifies floating point to integer cast. 
func @generalize_matmul_tensor_f32_f32_i16(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xi16>) -> tensor<16x32xi16> { diff --git a/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py b/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py index 44ac4e8e8c5b4..ed33644859012 100644 --- a/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py +++ b/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py @@ -84,6 +84,13 @@ def fill_rng_poly( O[D.m, D.n] = cast(T, (offset + cast(F64, rand2)) * scaling + min) +@linalg_structured_op +def soft_plus_poly( + I=TensorDef(T, S.M, S.N), O=TensorDef(U, S.M, S.N, output=True)): + O[D.m, D.n] = \ + PrimFn.log(cast(U, const(1.0)) + cast(U, PrimFn.exp(I[D.m, D.n]))) + + with Context() as ctx, Location.unknown(): module = Module.create() f16 = F16Type.get() @@ -299,5 +306,19 @@ def test_f32f32_min_pooling(input, shape, init_result): def test_i32_fill_rng(min, max, seed, init_result): return fill_rng_poly(min, max, seed, outs=[init_result]) + # CHECK-LABEL: @test_f32_soft_plus + # CHECK: ^{{.*}}(%[[IN:.+]]: f32, %[[OUT:.+]]: f32) + # CHECK-NEXT: %[[C1:.+]] = constant 1.000000e+00 : f64 + # CHECK-NEXT: %[[C1_CAST:.+]] = fptrunc %[[C1]] : f64 to f32 + # CHECK-NEXT: %[[EXP:.+]] = math.exp %[[IN]] : f32 + # CHECK-NEXT: %[[SUM:.+]] = addf %[[C1_CAST]], %[[EXP]] : f32 + # CHECK-NEXT: %[[LOG:.+]] = math.log %[[SUM]] : f32 + # CHECK-NEXT: linalg.yield %[[LOG]] : f32 + # CHECK-NEXT: -> tensor<4x16xf32> + @builtin.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((4, 16), f32)) + def test_f32_soft_plus(input, init_result): + return soft_plus_poly(input, outs=[init_result]) + print(module) From 715ca752ac4f8ba69fe68110823e0eabf5614bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 7 Jul 2021 21:06:08 +0000 Subject: [PATCH 28/32] [libcxx] [test] Fix spurious failures in the thread detach test on Windows Make sure that the detached thread has started up before exiting the process. If the detached thread hasn't started up at all, and the main thread exits, global data structures in the process are torn down, which then can cause crashes when the thread starts up late after required mutexes have been destroyed. (In particular, the mutex used internally in _Init_thread_header, which is used in the initialization of __thread_local_data()::__p, can cause crashes if the main thread already has finished and progressed far with destruction.) 
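For context, the synchronization pattern the test adopts can be seen in isolation in the sketch below. It is illustrative only: it uses a plain std::thread and a file-scope std::atomic<bool> named done, both assumptions standing in for the test's support::make_test_thread helper and whatever flag the test file declares. The detached thread sets the flag as its first action, and main spins on it so the process cannot exit before the thread has actually started.

#include <atomic>
#include <thread>

std::atomic<bool> done(false);

// First thing the detached thread does is record that it has started.
void foo() { done = true; }

int main() {
  std::thread t(foo);
  t.detach();
  // Busy-wait until the detached thread has started up. Without this,
  // main could return and begin tearing down process-wide state (such as
  // the mutex used by the thread-safe-statics machinery mentioned above)
  // before foo() has run at all.
  while (!done) {}
  return 0;
}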
Differential Revision: https://reviews.llvm.org/D105592 --- .../thread.thread.member/detach.pass.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp index ea82d5392aeb6..03dc79d2d8379 100644 --- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp +++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp @@ -60,7 +60,7 @@ class G int G::n_alive = 0; bool G::op_run = false; -void foo() {} +void foo() { done = true; } int main(int, char**) { @@ -75,6 +75,7 @@ int main(int, char**) assert(G::n_alive == 1); } assert(G::n_alive == 0); + done = false; #ifndef TEST_HAS_NO_EXCEPTIONS { std::thread t0 = support::make_test_thread(foo); @@ -85,6 +86,11 @@ int main(int, char**) t0.detach(); } catch (std::system_error const&) { } + // Wait to make sure that the detached thread has started up. + // Without this, we could exit main and start destructing global + // resources that are needed when the thread starts up, while the + // detached thread would start up only later. + while (!done) {} } #endif From d58c7a92380e030af6e6f82ce55bc14a919f39ea Mon Sep 17 00:00:00 2001 From: Moritz Sichert Date: Wed, 26 May 2021 10:50:15 +0200 Subject: [PATCH 29/32] [IR] Added operator delete to subclasses of User to avoid UB Several subclasses of User override operator new without also overriding operator delete. This means that delete expressions fall back to using operator delete of the base class, which would be User. However, this is only allowed if the base class has a virtual destructor which is not the case for User, so this is UB. See also [expr.delete] (3) for the exact wording. This is actually detected in some cases by GCC 11's -Wmismatched-new-delete now which is how I found this error. 
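The shape of the fix can be illustrated with a standalone sketch. The names UserLike and TwoOperandInst are invented for illustration and are not LLVM classes: a base class whose operator new takes an operand count, and a subclass that overrides operator new and, as this patch does throughout, now pairs it with an operator delete forwarding to the base, so that a delete-expression finds a matching deallocation function in the subclass itself.

#include <cstdlib>
#include <new>

struct UserLike {
  // Allocates room for the object plus NumOps trailing operand slots.
  void *operator new(std::size_t S, unsigned NumOps) {
    return std::malloc(S + NumOps * sizeof(void *));
  }
  void operator delete(void *Ptr) { std::free(Ptr); }
  // Deliberately no virtual destructor, mirroring llvm::User.
};

struct TwoOperandInst : UserLike {
  // allocate space for exactly two operands
  void *operator new(std::size_t S) { return UserLike::operator new(S, 2); }
  // Added alongside operator new, as the patch does for each subclass:
  void operator delete(void *Ptr) { UserLike::operator delete(Ptr); }
};

int main() {
  TwoOperandInst *I = new TwoOperandInst;
  delete I; // now uses TwoOperandInst::operator delete, matching its operator new
  return 0;
}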
Differential Revision: https://reviews.llvm.org/D103143 --- llvm/include/llvm/Analysis/MemorySSA.h | 10 ++-- llvm/include/llvm/IR/Constants.h | 12 +++-- llvm/include/llvm/IR/GlobalIndirectSymbol.h | 5 +- llvm/include/llvm/IR/InstrTypes.h | 15 +++--- llvm/include/llvm/IR/Instructions.h | 55 ++++++++++----------- llvm/lib/IR/ConstantsContext.h | 43 +++++++--------- 6 files changed, 68 insertions(+), 72 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index a26115aa82f11..f40b99968fd3a 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -329,7 +329,8 @@ class MemoryUse final : public MemoryUseOrDef { /*NumOperands=*/1) {} // allocate space for exactly one operand - void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { return MA->getValueID() == MemoryUseVal; @@ -389,7 +390,8 @@ class MemoryDef final : public MemoryUseOrDef { ID(Ver) {} // allocate space for exactly two operands - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { return MA->getValueID() == MemoryDefVal; @@ -484,9 +486,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUseOrDef, MemoryAccess) /// issue. class MemoryPhi final : public MemoryAccess { // allocate space for exactly zero operands - void *operator new(size_t s) { return User::operator new(s); } + void *operator new(size_t S) { return User::operator new(S); } public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess); diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 07d8e9ab5bb62..142dc21874508 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -58,9 +58,11 @@ class ConstantData : public Constant { protected: explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, nullptr, 0) {} - void *operator new(size_t s) { return User::operator new(s, 0); } + void *operator new(size_t S) { return User::operator new(S, 0); } public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + ConstantData(const ConstantData &) = delete; /// Methods to support type inquiry through isa, cast, and dyn_cast. @@ -849,12 +851,14 @@ class BlockAddress final : public Constant { BlockAddress(Function *F, BasicBlock *BB); - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Return a BlockAddress for the specified function and basic block. 
static BlockAddress *get(Function *F, BasicBlock *BB); @@ -893,12 +897,14 @@ class DSOLocalEquivalent final : public Constant { DSOLocalEquivalent(GlobalValue *GV); - void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t S) { return User::operator new(S, 1); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Return a DSOLocalEquivalent for the specified global value. static DSOLocalEquivalent *get(GlobalValue *GV); diff --git a/llvm/include/llvm/IR/GlobalIndirectSymbol.h b/llvm/include/llvm/IR/GlobalIndirectSymbol.h index d996237aa3efb..e45c7529885d5 100644 --- a/llvm/include/llvm/IR/GlobalIndirectSymbol.h +++ b/llvm/include/llvm/IR/GlobalIndirectSymbol.h @@ -35,9 +35,8 @@ class GlobalIndirectSymbol : public GlobalValue { GlobalIndirectSymbol &operator=(const GlobalIndirectSymbol &) = delete; // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 0e372d1cc8793..2f31db0fa4d7e 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -68,9 +68,8 @@ class UnaryInstruction : public Instruction { public: // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -203,9 +202,8 @@ class BinaryOperator : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -769,9 +767,8 @@ class CmpInst : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Construct a compare instruction, given the opcode, the predicate and /// the two operands. Optionally (if InstBefore is specified) insert the diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 5de72de77f839..e48a14f4b5b4b 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -333,9 +333,8 @@ class StoreInst : public Instruction { AtomicOrdering Order, SyncScope::ID SSID, BasicBlock *InsertAtEnd); // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Return true if this is a store to a volatile memory location. 
bool isVolatile() const { return getSubclassData(); } @@ -463,9 +462,8 @@ class FenceInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s, 0); - } + void *operator new(size_t S) { return User::operator new(S, 0); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Returns the ordering constraint of this fence instruction. AtomicOrdering getOrdering() const { @@ -547,9 +545,8 @@ class AtomicCmpXchgInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly three operands - void *operator new(size_t s) { - return User::operator new(s, 3); - } + void *operator new(size_t S) { return User::operator new(S, 3); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; using WeakField = BoolBitfieldElementT; @@ -792,9 +789,8 @@ class AtomicRMWInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; using AtomicOrderingField = @@ -2040,7 +2036,8 @@ class ShuffleVectorInst : public Instruction { ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, const Twine &NameStr, BasicBlock *InsertAtEnd); - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Swap the operands and adjust the mask to preserve the semantics /// of the instruction. @@ -2497,9 +2494,8 @@ class InsertValueInst : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } static InsertValueInst *Create(Value *Agg, Value *Val, ArrayRef Idxs, @@ -2875,9 +2871,7 @@ class LandingPadInst : public Instruction { const Twine &NameStr, BasicBlock *InsertAtEnd); // Allocate space for exactly zero operands. - void *operator new(size_t s) { - return User::operator new(s); - } + void *operator new(size_t S) { return User::operator new(S); } void growOperands(unsigned Size); void init(unsigned NumReservedValues, const Twine &NameStr); @@ -2889,6 +2883,8 @@ class LandingPadInst : public Instruction { LandingPadInst *cloneImpl() const; public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Constructors - NumReservedClauses is a hint for the number of incoming /// clauses that this landingpad will have (use 0 if you really have no idea). 
static LandingPadInst *Create(Type *RetTy, unsigned NumReservedClauses, @@ -3207,9 +3203,7 @@ class SwitchInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s); - } + void *operator new(size_t S) { return User::operator new(S); } void init(Value *Value, BasicBlock *Default, unsigned NumReserved); void growOperands(); @@ -3221,6 +3215,8 @@ class SwitchInst : public Instruction { SwitchInst *cloneImpl() const; public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + // -2 static const unsigned DefaultPseudoIndex = static_cast(~0L-1); @@ -3605,9 +3601,7 @@ class IndirectBrInst : public Instruction { IndirectBrInst(Value *Address, unsigned NumDests, BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s); - } + void *operator new(size_t S) { return User::operator new(S); } void init(Value *Address, unsigned NumDests); void growOperands(); @@ -3619,6 +3613,8 @@ class IndirectBrInst : public Instruction { IndirectBrInst *cloneImpl() const; public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Iterator type that casts an operand to a basic block. /// /// This only makes sense because the successors are stored as adjacent @@ -4256,7 +4252,7 @@ class CatchSwitchInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { return User::operator new(s); } + void *operator new(size_t S) { return User::operator new(S); } void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved); void growOperands(unsigned Size); @@ -4268,6 +4264,8 @@ class CatchSwitchInst : public Instruction { CatchSwitchInst *cloneImpl() const; public: + void operator delete(void *Ptr) { return User::operator delete(Ptr); } + static CatchSwitchInst *Create(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumHandlers, const Twine &NameStr = "", @@ -4696,9 +4694,8 @@ class UnreachableInst : public Instruction { explicit UnreachableInst(LLVMContext &C, BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s, 0); - } + void *operator new(size_t S) { return User::operator new(S, 0); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } unsigned getNumSuccessors() const { return 0; } diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 7fc25a8944e6b..4056c57480816 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -51,9 +51,8 @@ class UnaryConstantExpr final : public ConstantExpr { } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -79,9 +78,8 @@ class BinaryConstantExpr final : public ConstantExpr { } // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. 
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -106,9 +104,8 @@ class SelectConstantExpr final : public ConstantExpr { } // allocate space for exactly three operands - void *operator new(size_t s) { - return User::operator new(s, 3); - } + void *operator new(size_t S) { return User::operator new(S, 3); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -134,9 +131,8 @@ class ExtractElementConstantExpr final : public ConstantExpr { } // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -163,9 +159,8 @@ class InsertElementConstantExpr final : public ConstantExpr { } // allocate space for exactly three operands - void *operator new(size_t s) { - return User::operator new(s, 3); - } + void *operator new(size_t S) { return User::operator new(S, 3); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -200,7 +195,8 @@ class ShuffleVectorConstantExpr final : public ConstantExpr { SmallVector ShuffleMask; Constant *ShuffleMaskForBitcode; - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -226,9 +222,8 @@ class ExtractValueConstantExpr final : public ConstantExpr { } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Indices - These identify which value to extract. const SmallVector Indices; @@ -258,9 +253,8 @@ class InsertValueConstantExpr final : public ConstantExpr { } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Indices - These identify the position for the insertion. const SmallVector Indices; @@ -323,9 +317,8 @@ class CompareConstantExpr final : public ConstantExpr { } // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); From 31f80393bc06f8eeab35218e1d4476bf120e452e Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Thu, 8 Jul 2021 10:09:00 +0000 Subject: [PATCH 30/32] Revert "[mlir][MemRef] Fix DimOp folding of OffsetSizeAndStrideInterface." This reverts commit 6c0fd4db79f2def432f761627bb8c7d4171a3237. This simple implementation is unfortunately not extensible and needs to be reverted. The extensible way should be to extend https://reviews.llvm.org/D104321. 
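The folding being reverted mapped a result dimension to the op's dynamic-size operands via the relative index of that dimension among the dynamic dimensions. A standalone sketch of that mapping follows (plain C++, with kDynamic as an illustrative sentinel rather than the MLIR constant; this mirrors the helper removed below, not a surviving API):

#include <cassert>
#include <cstdint>
#include <vector>

constexpr int64_t kDynamic = -1; // stand-in for a dynamic extent in a shape

// Count the dynamic dims at positions 0..dim; the result indexes into the
// op's ordered list of dynamic-size operands.
int64_t relativeIndexOfDynamicDim(const std::vector<int64_t> &shape,
                                  unsigned dim) {
  assert(shape[dim] == kDynamic && "expected a dynamic dim");
  int64_t nth = -1;
  for (unsigned i = 0; i <= dim; ++i)
    if (shape[i] == kDynamic)
      ++nth;
  return nth;
}

int main() {
  // Shape ?x4x?x1: dims 0 and 2 are dynamic, so the op carries two size operands.
  std::vector<int64_t> shape = {kDynamic, 4, kDynamic, 1};
  assert(relativeIndexOfDynamicDim(shape, 0) == 0); // first size operand
  assert(relativeIndexOfDynamicDim(shape, 2) == 1); // second size operand
  return 0;
}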
--- mlir/include/mlir/IR/BuiltinTypes.h | 4 --- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 12 +++---- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 16 +++------ mlir/lib/IR/BuiltinTypes.cpp | 9 ----- mlir/test/Dialect/MemRef/canonicalize.mlir | 23 +------------ mlir/test/Dialect/Tensor/canonicalize.mlir | 39 ---------------------- 6 files changed, 11 insertions(+), 92 deletions(-) diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h index 44e751ab0edeb..8b30fa94f9936 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.h +++ b/mlir/include/mlir/IR/BuiltinTypes.h @@ -110,10 +110,6 @@ class ShapedType : public Type { /// size. Otherwise, abort. int64_t getNumDynamicDims() const; - /// If `dim` is a dynamic dim, return its relative index among the dynamic - /// dims. Otherwise, abort. The result is guaranteed to be nonnegative. - int64_t getRelativeIndexOfDynamicDim(unsigned dim) const; - /// If this is ranked type, return the size of the specified dimension. /// Otherwise, abort. int64_t getDimSize(unsigned idx) const; diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index a4cbb23bf74dc..518539376c9f4 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -175,9 +175,9 @@ struct SimplifyDeadAlloc : public OpRewritePattern { LogicalResult matchAndRewrite(T alloc, PatternRewriter &rewriter) const override { if (llvm::any_of(alloc->getUsers(), [&](Operation *op) { - if (auto storeOp = dyn_cast(op)) - return storeOp.value() == alloc; - return !isa(op); + if (auto storeOp = dyn_cast(op)) + return storeOp.value() == alloc; + return !isa(op); })) return failure(); @@ -677,9 +677,9 @@ OpFoldResult DimOp::fold(ArrayRef operands) { if (auto sizeInterface = dyn_cast_or_null(definingOp)) { - int64_t nthDynamicIndex = - memrefType.getRelativeIndexOfDynamicDim(unsignedIndex); - return sizeInterface.sizes()[nthDynamicIndex]; + assert(sizeInterface.isDynamicSize(unsignedIndex) && + "Expected dynamic subview size"); + return sizeInterface.getDynamicSize(unsignedIndex); } // dim(memrefcast) -> dim diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index b794c11d5948f..dbd47c2d1fcd0 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -271,21 +271,13 @@ OpFoldResult DimOp::fold(ArrayRef operands) { return Value{*dynExtents}; } - // dim(insert_slice.result()) -> dim(insert_slice.dest()) - if (auto insertSliceOp = - dyn_cast_or_null(definingOp)) { - this->sourceMutable().assign(insertSliceOp.dest()); - return getResult(); - } - // The size at the given index is now known to be a dynamic size. 
unsigned unsignedIndex = index.getValue().getZExtValue(); - if (auto sizeInterface = - dyn_cast_or_null(definingOp)) { - int64_t nthDynamicIndex = - tensorType.getRelativeIndexOfDynamicDim(unsignedIndex); - return sizeInterface.sizes()[nthDynamicIndex]; + if (auto sliceOp = dyn_cast_or_null(definingOp)) { + assert(sliceOp.isDynamicSize(unsignedIndex) && + "Expected dynamic slice size"); + return sliceOp.getDynamicSize(unsignedIndex); } // dim(cast) -> dim diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp index 0c715d2d528f5..f350596384a90 100644 --- a/mlir/lib/IR/BuiltinTypes.cpp +++ b/mlir/lib/IR/BuiltinTypes.cpp @@ -427,15 +427,6 @@ int64_t ShapedType::getNumDynamicDims() const { return llvm::count_if(getShape(), isDynamic); } -int64_t ShapedType::getRelativeIndexOfDynamicDim(unsigned dim) const { - assert(isDynamicDim(dim) && "expected a dynamic dim"); - int nthDynamicIndex = -1; - for (unsigned idx = 0; idx <= dim; ++idx) - if (isDynamicDim(idx)) - ++nthDynamicIndex; - return nthDynamicIndex; -} - bool ShapedType::hasStaticShape() const { return hasRank() && llvm::none_of(getShape(), isDynamic); } diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir index 302477f04421e..2ae2c06dea92e 100644 --- a/mlir/test/Dialect/MemRef/canonicalize.mlir +++ b/mlir/test/Dialect/MemRef/canonicalize.mlir @@ -387,32 +387,11 @@ func @alloc_const_fold_with_symbols2() -> memref { } // ----- - // CHECK-LABEL: func @allocator // CHECK: %[[alloc:.+]] = memref.alloc // CHECK: memref.store %[[alloc:.+]], %arg0 func @allocator(%arg0 : memref>, %arg1 : index) { %0 = memref.alloc(%arg1) : memref memref.store %0, %arg0[] : memref> - return -} - -// ----- - -#map0 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)> - -// CHECK-LABEL: func @rank_reducing_subview_dim -// CHECK-SAME: %[[IDX_0:[0-9a-zA-Z]*]]: index -// CHECK-SAME: %[[IDX_1:[0-9a-zA-Z]*]]: index -func @rank_reducing_subview_dim(%arg0 : memref, %arg1 : index, - %arg2 : index) -> index -{ - %c0 = constant 0 : index - %c1 = constant 1 : index - %c4 = constant 4 : index - %0 = memref.subview %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : memref to memref - %1 = memref.dim %0, %c1 : memref - - // CHECK-NEXT: return %[[IDX_1]] : index - return %1 : index + return } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 977357077df37..f0259952da380 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -517,42 +517,3 @@ func @fold_dim_of_tensor.cast(%arg0 : tensor<4x?xf32>) -> (index, index) { %2 = tensor.dim %0, %c1 : tensor return %1, %2: index, index } - -// ----- - -// CHECK-LABEL: func @rank_reducing_extract_slice_dim -// CHECK-SAME: %[[IDX_0:[0-9a-zA-Z]*]]: index -// CHECK-SAME: %[[IDX_1:[0-9a-zA-Z]*]]: index -func @rank_reducing_extract_slice_dim(%arg0 : tensor, %arg1 : index, - %arg2 : index) -> index -{ - %c0 = constant 0 : index - %c1 = constant 1 : index - %c4 = constant 4 : index - %0 = tensor.extract_slice %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor to tensor - %1 = tensor.dim %0, %c1 : tensor - - // CHECK-NEXT: return %[[IDX_1]] : index - return %1 : index -} - -// ----- - -// CHECK-LABEL: func @rank_reducing_insert_slice_dim -// CHECK-SAME: %[[OUT:[0-9a-zA-Z]*]]: tensor -func @rank_reducing_insert_slice_dim(%out : tensor, %in : tensor, %arg1 : index, - %arg2 : index) -> index -{ - // CHECK-NEXT: %[[C1:.*]] = constant 1 : index - - %c0 
= constant 0 : index - %c1 = constant 1 : index - %c4 = constant 4 : index - %0 = tensor.insert_slice %in into %out[%c0, %arg1, %c1] [1, 1, 1] [%c1, %c1, %c1] : tensor into tensor - - // CHECK-NEXT: %[[D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor - %1 = tensor.dim %0, %c1 : tensor - - // CHECK-NEXT: return %[[D1]] : index - return %1 : index -} From 767eb9f9d5082b295cb7fe01d5e7d22fce72396a Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Thu, 8 Jul 2021 17:27:18 +0700 Subject: [PATCH 31/32] [Test] Add loop deletion switch tests Patch by Dmitry Makogon! Differential Revision: https://reviews.llvm.org/D105543 --- .../LoopDeletion/eval_first_iteration.ll | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) diff --git a/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll b/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll index 468382db00b46..8d04f5285ece6 100644 --- a/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll +++ b/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll @@ -1046,3 +1046,192 @@ done: ; preds = %backedge %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] ret i32 %sum.next.lcssa } + + + +; Switch tests + +; Here switch will always jump to the default label +define i32 @test_switch_ne_default() { +; CHECK-LABEL: @test_switch_ne_default( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 4, [[SUM]] +; CHECK-NEXT: switch i32 [[SUB]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[ONZERO:%.*]] +; CHECK-NEXT: i32 1, label [[ONONE:%.*]] +; CHECK-NEXT: i32 2, label [[ONTWO:%.*]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onzero: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onone: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: ontwo: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[MERGE_PHI:%.*]] = phi i32 [ [[SUB]], [[DEFAULT]] ], [ 0, [[ONZERO]] ], [ 1, [[ONONE]] ], [ 2, [[ONTWO]] ] +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[MERGE_PHI]] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[SUM_NEXT]], 4 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %sum = phi i32 [ 0, %entry ], [ %sum.next, %backedge ] + %sub = sub i32 4, %sum + switch i32 %sub, label %default [ + i32 0, label %onzero + i32 1, label %onone + i32 2, label %ontwo + ] + +default: ; preds = %loop + br label %backedge + +onzero: ; preds = %loop + br label %backedge + +onone: ; preds = %loop + br label %backedge + +ontwo: ; preds = %loop + br label %backedge + +backedge: ; preds = %ontwo, %onone, %onzero, %default + %merge.phi = phi i32 [ %sub, %default ], [ 0, %onzero ], [ 1, %onone ], [ 2, %ontwo ] + %sum.next = add i32 %sum, %merge.phi + %loop.cond = icmp ne i32 %sum.next, 4 + br i1 %loop.cond, label %loop, label %done + +done: ; preds = %backedge + %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] + ret i32 %sum.next.lcssa +} + +; Here switch will always jump to the %ontwo label +define i32 @test_switch_ne_one_case() { +; CHECK-LABEL: @test_switch_ne_one_case( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_NEXT:%.*]], 
[[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 4, [[SUM]] +; CHECK-NEXT: switch i32 [[SUB]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[ONZERO:%.*]] +; CHECK-NEXT: i32 1, label [[ONONE:%.*]] +; CHECK-NEXT: i32 4, label [[ONTWO:%.*]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onzero: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onone: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: ontwo: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[MERGE_PHI:%.*]] = phi i32 [ 2, [[DEFAULT]] ], [ 0, [[ONZERO]] ], [ 1, [[ONONE]] ], [ [[SUB]], [[ONTWO]] ] +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[MERGE_PHI]] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[SUM_NEXT]], 4 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %sum = phi i32 [ 0, %entry ], [ %sum.next, %backedge ] + %sub = sub i32 4, %sum + switch i32 %sub, label %default [ + i32 0, label %onzero + i32 1, label %onone + i32 4, label %ontwo + ] + +default: ; preds = %loop + br label %backedge + +onzero: ; preds = %loop + br label %backedge + +onone: ; preds = %loop + br label %backedge + +ontwo: ; preds = %loop + br label %backedge + +backedge: ; preds = %ontwo, %onone, %onzero, %default + %merge.phi = phi i32 [ 2, %default ], [ 0, %onzero ], [ 1, %onone ], [ %sub, %ontwo ] + %sum.next = add i32 %sum, %merge.phi + %loop.cond = icmp ne i32 %sum.next, 4 + br i1 %loop.cond, label %loop, label %done + +done: ; preds = %backedge + %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] + ret i32 %sum.next.lcssa +} + +; Here switch will always jump to the %backedge label, but there are two jumps to this label in switch +define i32 @test_switch_ne_one_case_identical_jumps() { +; CHECK-LABEL: @test_switch_ne_one_case_identical_jumps( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 2, [[SUM]] +; CHECK-NEXT: switch i32 [[SUB]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[FIRST_BLOCK:%.*]] +; CHECK-NEXT: i32 1, label [[BACKEDGE]] +; CHECK-NEXT: i32 2, label [[BACKEDGE]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: first_block: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[MERGE_PHI:%.*]] = phi i32 [ 0, [[DEFAULT]] ], [ 1, [[FIRST_BLOCK]] ], [ [[SUB]], [[LOOP]] ], [ [[SUB]], [[LOOP]] ] +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[MERGE_PHI]] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[SUM_NEXT]], 2 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %sum = phi i32 [ 0, %entry ], [ %sum.next, %backedge ] + %sub = sub i32 2, %sum + switch i32 %sub, label %default [ + i32 0, label %first_block + i32 1, label %backedge + i32 2, label %backedge + ] + +default: ; preds = %loop + br label %backedge + +first_block: ; preds = %loop + br label %backedge + +backedge: ; preds = %first_block, %default, %loop, %loop + %merge.phi = phi i32 [ 0, %default ], [ 1, %first_block ], [ %sub, %loop ], [ %sub, 
%loop ] + %sum.next = add i32 %sum, %merge.phi + %loop.cond = icmp ne i32 %sum.next, 2 + br i1 %loop.cond, label %loop, label %done + +done: ; preds = %backedge + %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] + ret i32 %sum.next.lcssa +} From 026bb84bcd42b875d77b769eb5ee4c19fc2a9719 Mon Sep 17 00:00:00 2001 From: Bradley Smith Date: Tue, 6 Jul 2021 15:12:04 +0100 Subject: [PATCH 32/32] [AArch64][SVE] Add ISel patterns for floating point compare with zero instructions Additionally, lower the floating point compare SVE intrinsics to SETCC_MERGE_ZERO ISD nodes to avoid duplicating ISel patterns. Differential Revision: https://reviews.llvm.org/D105486 --- .../Target/AArch64/AArch64ISelLowering.cpp | 37 +++-- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 22 +-- llvm/lib/Target/AArch64/SVEInstrFormats.td | 35 ++++- llvm/test/CodeGen/AArch64/sve-fcmp.ll | 114 ++++++++++++++ .../AArch64/sve-fixed-length-masked-gather.ll | 57 +++---- .../sve-fixed-length-masked-scatter.ll | 51 ++---- .../AArch64/sve-intrinsics-fp-compares.ll | 148 +++++++++++++----- 7 files changed, 319 insertions(+), 145 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e4fb9b7ae9679..5caab2e85486e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14401,29 +14401,34 @@ static SDValue performIntrinsicCombine(SDNode *N, N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); break; + case Intrinsic::aarch64_sve_fcmpge: case Intrinsic::aarch64_sve_cmpge: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETGE)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGE)); break; + case Intrinsic::aarch64_sve_fcmpgt: case Intrinsic::aarch64_sve_cmpgt: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETGT)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGT)); break; + case Intrinsic::aarch64_sve_fcmpeq: case Intrinsic::aarch64_sve_cmpeq: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); break; + case Intrinsic::aarch64_sve_fcmpne: case Intrinsic::aarch64_sve_cmpne: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETNE)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETNE)); + break; + case Intrinsic::aarch64_sve_fcmpuo: + return 
DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUO)); break; case Intrinsic::aarch64_sve_fadda: return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 4465ee7ce7620..0b483be3176c9 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1255,20 +1255,20 @@ let Predicates = [HasSVE] in { defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>; defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; - defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, SETOGE, SETGE, SETOLE, SETLE>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, SETOGT, SETGT, SETOLT, SETLT>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, SETOEQ, SETEQ, SETOEQ, SETEQ>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, SETONE, SETNE, SETONE, SETNE>; - defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, SETUO, SETUO, SETUO, SETUO>; + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETONE, SETNE, SETONE, SETNE>; + defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", SETUO, SETUO, SETUO, SETUO>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; - defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">; - defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">; - defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">; - defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">; - defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">; - defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">; + defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; + defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; + defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt", SETOLT, SETLT, SETOGT, SETGT>; + defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle", SETOLE, SETLE, SETOGE, SETGE>; + defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; + defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETONE, SETNE, SETONE, SETNE>; defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index f60f6cacf2c3b..1e44a267c8b0b 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4394,6 +4394,14 @@ multiclass SVE_SETCC_Pat; } +multiclass SVE_SETCC_Pat_With_Zero { + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, (SVEDup0), cc)), + (cmp $Op1, $Op2)>; + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)), + (cmp $Op1, $Op2)>; +} + multiclass sve_int_cmp_0 opc, string asm, CondCode cc, CondCode invcc> { def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>; def _H : sve_int_cmp<0b0, 0b01, opc, asm, 
PPR16, ZPR16, ZPR16>; @@ -4754,10 +4762,13 @@ multiclass sve_fp_3op_p_pd opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_fp_3op_p_pd_cc opc, string asm, SDPatternOperator op, +multiclass sve_fp_3op_p_pd_cc opc, string asm, CondCode cc1, CondCode cc2, - CondCode invcc1, CondCode invcc2> -: sve_fp_3op_p_pd { + CondCode invcc1, CondCode invcc2> { + def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>; + def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>; + def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>; + defm : SVE_SETCC_Pat(NAME # _H)>; defm : SVE_SETCC_Pat(NAME # _H)>; defm : SVE_SETCC_Pat(NAME # _H)>; @@ -4797,10 +4808,26 @@ class sve_fp_2op_p_pd sz, bits<3> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; } -multiclass sve_fp_2op_p_pd opc, string asm> { +multiclass sve_fp_2op_p_pd opc, string asm, + CondCode cc1, CondCode cc2, + CondCode invcc1, CondCode invcc2> { def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>; def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>; def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>; + + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _D)>; + + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _D)>; } diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index f2e109762ffd3..3c752ab5fa25d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -308,3 +308,117 @@ define @ne_fast( %x, %y = fcmp fast one %x, %x2 ret %y } +define @oeq_zero( %x) { +; CHECK-LABEL: oeq_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp oeq %x, zeroinitializer + ret %y +} +define @ogt_zero( %x) { +; CHECK-LABEL: ogt_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp ogt %x, zeroinitializer + ret %y +} +define @oge_zero( %x) { +; CHECK-LABEL: oge_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp oge %x, zeroinitializer + ret %y +} +define @olt_zero( %x) { +; CHECK-LABEL: olt_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmlt p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp olt %x, zeroinitializer + ret %y +} +define @ole_zero( %x) { +; CHECK-LABEL: ole_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmle p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp ole %x, zeroinitializer + ret %y +} +define @one_zero( %x) { +; CHECK-LABEL: one_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp one %x, zeroinitializer + ret %y +} +define @ueq_zero( %x) { +; CHECK-LABEL: ueq_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ueq %x, zeroinitializer + ret %y +} +define @ugt_zero( %x) { +; CHECK-LABEL: ugt_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s 
+; CHECK-NEXT: fcmle p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ugt %x, zeroinitializer + ret %y +} +define @uge_zero( %x) { +; CHECK-LABEL: uge_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmlt p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp uge %x, zeroinitializer + ret %y +} +define @ult_zero( %x) { +; CHECK-LABEL: ult_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ult %x, zeroinitializer + ret %y +} +define @ule_zero( %x) { +; CHECK-LABEL: ule_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ule %x, zeroinitializer + ret %y +} +define @une_zero( %x) { +; CHECK-LABEL: une_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp une %x, zeroinitializer + ret %y +} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 5ae9f9ecbb419..d8b08461b6ee1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -617,8 +617,7 @@ define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]].h +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_1024-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -638,8 +637,7 @@ define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]].h +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -702,8 +700,7 @@ define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]].s +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_512-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -722,8 +719,7 @@ 
define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]].s +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_1024-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -742,8 +738,7 @@ define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]].s +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -795,8 +790,7 @@ define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 { ; CHECK: ptrue [[PG0:p[0-9]+]].d, vl4 ; CHECK-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; CHECK-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; CHECK-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; CHECK-NEXT: ret @@ -813,8 +807,7 @@ define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_512-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_512-NEXT: ret @@ -831,8 +824,7 @@ define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { ; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_1024-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -849,8 +841,7 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_2048-NEXT: mov 
[[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -871,8 +862,7 @@ define void @masked_gather_32b_scaled_sext(<32 x half>* %a, <32 x i32>* %b, half ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #1] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -893,8 +883,7 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw #1] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -915,8 +904,7 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -938,8 +926,7 @@ define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -961,8 +948,7 @@ define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* % ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, 
[[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #2] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -982,8 +968,7 @@ define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %b ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1006,9 +991,8 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %o ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2 -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1031,9 +1015,8 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4 -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1054,9 +1037,8 @@ define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x f ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] ; VBITS_GE_2048-NEXT: ld1w { [[PT:z[0-9]+]].s }, [[PG0]]/z, [x2] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: sel [[SEL:z[0-9]+]].s, [[PG1]], [[UZP]].s, [[PT]].s @@ -1077,8 +1059,7 @@ define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: 
ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index aa79ea7992b70..5dc40e399d0ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -562,8 +562,7 @@ define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s ; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] @@ -581,8 +580,7 @@ define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s ; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] @@ -639,8 +637,7 @@ define void @masked_scatter_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret @@ -657,8 +654,7 @@ define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret @@ -675,8 +671,7 @@ define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* 
%b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret @@ -723,8 +718,7 @@ define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 { ; CHECK: ptrue [[PG0:p[0-9]+]].d, vl4 ; CHECK-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; CHECK-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <4 x double>, <4 x double>* %a @@ -739,8 +733,7 @@ define void @masked_scatter_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]] +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_512-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x double>, <8 x double>* %a @@ -755,8 +748,7 @@ define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { ; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]] +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_1024-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x double>, <16 x double>* %a @@ -771,8 +763,7 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x double>, <32 x double>* %a @@ -791,8 +782,7 @@ define void @masked_scatter_32b_scaled_sext(<32 x half>* %a, <32 x i32>* %b, hal ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, 
[[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1] ; VBITS_GE_2048-NEXT: ret @@ -811,8 +801,7 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1] ; VBITS_GE_2048-NEXT: ret @@ -831,8 +820,7 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw] ; VBITS_GE_2048-NEXT: ret @@ -852,8 +840,7 @@ define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw] ; VBITS_GE_2048-NEXT: ret @@ -873,8 +860,7 @@ define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2] ; VBITS_GE_2048-NEXT: ret @@ -892,8 +878,7 @@ define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* % ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d] ; VBITS_GE_2048-NEXT: ret @@ -914,8 +899,7 @@ define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 % ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, 
[[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
 ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2
-; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
+; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d
 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
 ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
@@ -937,8 +921,7 @@ define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
 ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4
-; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
+; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d
 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
 ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll
index 3200f14680bf3..ceda1c2b05121 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ;
@@ -6,8 +7,9 @@
 
 define <vscale x 8 x i1> @facge_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: facge_h:
-; CHECK: facge p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    facge p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i1> @llvm.aarch64.sve.facge.nxv8f16(<vscale x 8 x i1> %pg,
                                   <vscale x 8 x half> %a,
                                   <vscale x 8 x half> %b)
@@ -16,8 +18,9 @@ define <vscale x 8 x i1> @facge_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a,
 
 define <vscale x 4 x i1> @facge_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: facge_s:
-; CHECK: facge p0.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    facge p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i1> @llvm.aarch64.sve.facge.nxv4f32(<vscale x 4 x i1> %pg,
                                   <vscale x 4 x float> %a,
                                   <vscale x 4 x float> %b)
@@ -26,8 +29,9 @@ define <vscale x 4 x i1> @facge_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a
 
 define <vscale x 2 x i1> @facge_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: facge_d:
-; CHECK: facge p0.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    facge p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i1> @llvm.aarch64.sve.facge.nxv2f64(<vscale x 2 x i1> %pg,
                                   <vscale x 2 x double> %a,
                                   <vscale x 2 x double> %b)
@@ -40,8 +44,9 @@ define <vscale x 2 x i1> @facge_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %
 
 define <vscale x 8 x i1> @facgt_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: facgt_h:
-; CHECK: facgt p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    facgt p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i1> @llvm.aarch64.sve.facgt.nxv8f16(<vscale x 8 x i1> %pg,
                                   <vscale x 8 x half> %a,
                                   <vscale x 8 x half> %b)
@@ -50,8 +55,9 @@ define <vscale x 8 x i1> @facgt_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a,
 
 define <vscale x 4 x i1> @facgt_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: facgt_s:
-; CHECK: facgt p0.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    facgt p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i1> @llvm.aarch64.sve.facgt.nxv4f32(<vscale x 4 x i1> %pg,
                                   <vscale x 4 x float> %a,
                                   <vscale x 4 x float> %b)
@@ -60,8 +66,9 @@ define <vscale x 4 x i1> @facgt_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a
 
 define <vscale x 2 x i1> @facgt_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: facgt_d:
-; CHECK: facgt p0.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    facgt p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i1> @llvm.aarch64.sve.facgt.nxv2f64(<vscale x 2 x i1> %pg,
                                   <vscale x 2 x double> %a,
                                   <vscale x 2 x double> %b)
@@ -74,8 +81,9 @@ define <vscale x 2 x i1> @facgt_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %
 
 define <vscale x 8 x i1> @fcmeq_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: fcmeq_h:
-; CHECK: fcmeq p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpeq.nxv8f16(<vscale x 8 x i1> %pg,
                                    <vscale x 8 x half> %a,
                                    <vscale x 8 x half> %b)
@@ -84,8 +92,9 @@ define <vscale x 8 x i1> @fcmeq_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a,
 
 define <vscale x 4 x i1> @fcmeq_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: fcmeq_s:
-; CHECK: fcmeq p0.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i1> @llvm.aarch64.sve.fcmpeq.nxv4f32(<vscale x 4 x i1> %pg,
                                    <vscale x 4 x float> %a,
                                    <vscale x 4 x float> %b)
@@ -94,22 +103,35 @@ define <vscale x 4 x i1> @fcmeq_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a
 
 define <vscale x 2 x i1> @fcmeq_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: fcmeq_d:
-; CHECK: fcmeq p0.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpeq.nxv2f64(<vscale x 2 x i1> %pg,
                                    <vscale x 2 x double> %a,
                                    <vscale x 2 x double> %b)
   ret <vscale x 2 x i1> %out
 }
 
+define <vscale x 2 x i1> @fcmeq_zero(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fcmeq_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpeq.nxv2f64(<vscale x 2 x i1> %pg,
+                                   <vscale x 2 x double> %a,
+                                   <vscale x 2 x double> zeroinitializer)
+  ret <vscale x 2 x i1> %out
+}
+
 ;
 ; FCMGE
 ;
 
 define <vscale x 8 x i1> @fcmge_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: fcmge_h:
-; CHECK: fcmge p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmge p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpge.nxv8f16(<vscale x 8 x i1> %pg,
                                    <vscale x 8 x half> %a,
                                    <vscale x 8 x half> %b)
@@ -118,8 +140,9 @@ define <vscale x 8 x i1> @fcmge_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a,
 
 define <vscale x 4 x i1> @fcmge_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: fcmge_s:
-; CHECK: fcmge p0.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmge p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i1> @llvm.aarch64.sve.fcmpge.nxv4f32(<vscale x 4 x i1> %pg,
                                    <vscale x 4 x float> %a,
                                    <vscale x 4 x float> %b)
@@ -128,22 +151,34 @@ define <vscale x 4 x i1> @fcmge_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a
 
 define <vscale x 2 x i1> @fcmge_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: fcmge_d:
-; CHECK: fcmge p0.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmge p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpge.nxv2f64(<vscale x 2 x i1> %pg,
                                    <vscale x 2 x double> %a,
                                    <vscale x 2 x double> %b)
   ret <vscale x 2 x i1> %out
 }
 
+define <vscale x 2 x i1> @fcmge_zero(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fcmge_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmge p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpge.nxv2f64(<vscale x 2 x i1> %pg,
+                                   <vscale x 2 x double> %a,
+                                   <vscale x 2 x double> zeroinitializer)
+  ret <vscale x 2 x i1> %out
+}
 ;
 ; FCMGT
 ;
 
 define <vscale x 8 x i1> @fcmgt_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: fcmgt_h:
-; CHECK: fcmgt p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpgt.nxv8f16(<vscale x 8 x i1> %pg,
                                    <vscale x 8 x half> %a,
                                    <vscale x 8 x half> %b)
@@ -152,8 +187,9 @@ define <vscale x 8 x i1> @fcmgt_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a,
 
 define <vscale x 4 x i1> @fcmgt_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: fcmgt_s:
-; CHECK: fcmgt p0.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i1> @llvm.aarch64.sve.fcmpgt.nxv4f32(<vscale x 4 x i1> %pg,
                                    <vscale x 4 x float> %a,
                                    <vscale x 4 x float> %b)
@@ -162,22 +198,34 @@ define <vscale x 4 x i1> @fcmgt_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a
 
 define <vscale x 2 x i1> @fcmgt_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: fcmgt_d:
-; CHECK: fcmgt p0.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpgt.nxv2f64(<vscale x 2 x i1> %pg,
                                    <vscale x 2 x double> %a,
                                    <vscale x 2 x double> %b)
   ret <vscale x 2 x i1> %out
 }
 
+define <vscale x 2 x i1> @fcmgt_zero(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fcmgt_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpgt.nxv2f64(<vscale x 2 x i1> %pg,
+                                   <vscale x 2 x double> %a,
+                                   <vscale x 2 x double> zeroinitializer)
+  ret <vscale x 2 x i1> %out
+}
 ;
 ; FCMNE
 ;
 
 define <vscale x 8 x i1> @fcmne_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: fcmne_h:
-; CHECK: fcmne p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmne p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpne.nxv8f16(<vscale x 8 x i1> %pg,
                                    <vscale x 8 x half> %a,
                                    <vscale x 8 x half> %b)
@@ -186,8 +234,9 @@ define <vscale x 8 x i1> @fcmne_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a,
 
 define <vscale x 4 x i1> @fcmne_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: fcmne_s:
-; CHECK: fcmne p0.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmne p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i1> @llvm.aarch64.sve.fcmpne.nxv4f32(<vscale x 4 x i1> %pg,
                                    <vscale x 4 x float> %a,
                                    <vscale x 4 x float> %b)
@@ -196,22 +245,35 @@ define <vscale x 4 x i1> @fcmne_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a
 
 define <vscale x 2 x i1> @fcmne_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: fcmne_d:
-; CHECK: fcmne p0.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmne p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpne.nxv2f64(<vscale x 2 x i1> %pg,
                                    <vscale x 2 x double> %a,
                                    <vscale x 2 x double> %b)
   ret <vscale x 2 x i1> %out
 }
 
+define <vscale x 2 x i1> @fcmne_zero(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fcmne_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmne p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ret
+  %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpne.nxv2f64(<vscale x 2 x i1> %pg,
+                                   <vscale x 2 x double> %a,
+                                   <vscale x 2 x double> zeroinitializer)
+  ret <vscale x 2 x i1> %out
+}
+
 ;
 ; FCMPUO
 ;
 
 define <vscale x 8 x i1> @fcmuo_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: fcmuo_h:
-; CHECK: fcmuo p0.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpuo.nxv8f16(<vscale x 8 x i1> %pg,
                                    <vscale x 8 x half> %a,
                                    <vscale x 8 x half> %b)
@@ -220,8 +282,9 @@ define <vscale x 8 x i1> @fcmuo_h(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a,
 
 define <vscale x 4 x i1> @fcmuo_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: fcmuo_s:
-; CHECK: fcmuo p0.s, p0/z, z0.s, z1.s
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i1> @llvm.aarch64.sve.fcmpuo.nxv4f32(<vscale x 4 x i1> %pg,
                                    <vscale x 4 x float> %a,
                                    <vscale x 4 x float> %b)
@@ -230,8 +293,9 @@ define <vscale x 4 x i1> @fcmuo_s(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a
 
 define <vscale x 2 x i1> @fcmuo_d(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: fcmuo_d:
-; CHECK: fcmuo p0.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i1> @llvm.aarch64.sve.fcmpuo.nxv2f64(<vscale x 2 x i1> %pg,
                                    <vscale x 2 x double> %a,
                                    <vscale x 2 x double> %b)