diff --git a/clang/docs/AddressSanitizer.rst b/clang/docs/AddressSanitizer.rst
index 76fdf55995059..d937cbfdf583c 100644
--- a/clang/docs/AddressSanitizer.rst
+++ b/clang/docs/AddressSanitizer.rst
@@ -26,7 +26,7 @@ Typical slowdown introduced by AddressSanitizer is **2x**.
 How to build
 ============
 
-Build LLVM/Clang with `CMake <https://llvm.org/docs/CMake.html>` and enable
+Build LLVM/Clang with `CMake <https://llvm.org/docs/CMake.html>`_ and enable
 the ``compiler-rt`` runtime. An example CMake configuration that will allow
 for the use/testing of AddressSanitizer:
 
diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst
index b09162cd99f45..41b8bbb33baf1 100644
--- a/clang/docs/RealtimeSanitizer.rst
+++ b/clang/docs/RealtimeSanitizer.rst
@@ -21,7 +21,7 @@ The runtime slowdown introduced by RealtimeSanitizer is negligible.
 How to build
 ============
 
-Build LLVM/Clang with `CMake <https://llvm.org/docs/CMake.html>` and enable the
+Build LLVM/Clang with `CMake <https://llvm.org/docs/CMake.html>`_ and enable the
 ``compiler-rt`` runtime. An example CMake configuration that will allow for the
 use/testing of RealtimeSanitizer:
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index aef637c65e1b0..fcf00d5ac0e8d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -349,6 +349,11 @@ Modified Compiler Flags
   to utilize these vector libraries. The behavior for all other vector function
   libraries remains unchanged.
 
+- The ``-Wnontrivial-memaccess`` warning has been updated to also warn about
+  passing non-trivially-copyable destrination parameter to ``memcpy``,
+  ``memset`` and similar functions for which it is a documented undefined
+  behavior.
+
 Removed Compiler Flags
 -------------------------
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
index 9d81cacb50735..713494178b97b 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
@@ -37,6 +37,14 @@ struct UncheckedOptionalAccessModelOptions {
   /// can't identify when their results are used safely (across calls),
   /// resulting in false positives in all such cases. Note: this option does not
   /// cover access through `operator[]`.
+  /// FIXME: we currently cache and equate the result of const accessors
+  /// returning pointers, so cover the case of operator-> followed by
+  /// operator->, which covers the common case of smart pointers. We also cover
+  /// some limited cases of returning references (if return type is an optional
+  /// type), so cover some cases of operator* followed by operator*. We don't
+  /// cover mixing operator-> and operator*. Once we are confident in this const
+  /// accessor caching, we shouldn't need the IgnoreSmartPointerDereference
+  /// option anymore.
   bool IgnoreSmartPointerDereference = false;
 };
 
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 90475a361bb8f..9bd67e0cefebc 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4871,6 +4871,12 @@ def HLSLRadians : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLSplitDouble: LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_splitdouble"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9b9bdd7c800e3..34ff49d7238a7 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -795,6 +795,10 @@ def warn_cstruct_memaccess : Warning<
   "%1 call is a pointer to record %2 that is not trivial to "
   "%select{primitive-default-initialize|primitive-copy}3">,
   InGroup<NonTrivialMemaccess>;
+def warn_cxxstruct_memaccess : Warning<
+  "first argument in call to "
+  "%0 is a pointer to non-trivially copyable type %1">,
+  InGroup<NonTrivialMemaccess>;
 def note_nontrivial_field : Note<
   "field is non-trivial to %select{copy|default-initialize}0">;
 def err_non_trivial_c_union_in_invalid_context : Error<
diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
index 1ab3b5e5f8156..dd384c1d76c5f 100644
--- a/clang/include/clang/Basic/Module.h
+++ b/clang/include/clang/Basic/Module.h
@@ -227,7 +227,7 @@ class alignas(8) Module {
 
   /// A mapping from the submodule name to the index into the
   /// \c SubModules vector at which that submodule resides.
-  llvm::StringMap<unsigned> SubModuleIndex;
+  mutable llvm::StringMap<unsigned> SubModuleIndex;
 
   /// The AST file if this is a top-level module which has a
   /// corresponding serialized AST file, or null otherwise.
@@ -612,7 +612,6 @@ class alignas(8) Module {
   void setParent(Module *M) {
     assert(!Parent);
     Parent = M;
-    Parent->SubModuleIndex[Name] = Parent->SubModules.size();
     Parent->SubModules.push_back(this);
   }
 
diff --git a/clang/include/clang/Lex/ModuleMap.h b/clang/include/clang/Lex/ModuleMap.h
index 75b567a347cb6..53e9e0ec83ddb 100644
--- a/clang/include/clang/Lex/ModuleMap.h
+++ b/clang/include/clang/Lex/ModuleMap.h
@@ -546,6 +546,17 @@ class ModuleMap {
   std::pair<Module *, bool> findOrCreateModule(StringRef Name, Module *Parent,
                                                bool IsFramework,
                                                bool IsExplicit);
+  /// Call \c ModuleMap::findOrCreateModule and throw away the information
+  /// whether the module was found or created.
+  Module *findOrCreateModuleFirst(StringRef Name, Module *Parent,
+                                  bool IsFramework, bool IsExplicit) {
+    return findOrCreateModule(Name, Parent, IsFramework, IsExplicit).first;
+  }
+  /// Create new submodule, assuming it does not exist. This function can only
+  /// be called when it is guaranteed that this submodule does not exist yet.
+  /// The parameters have same semantics as \c ModuleMap::findOrCreateModule.
+  Module *createModule(StringRef Name, Module *Parent, bool IsFramework,
+                       bool IsExplicit);
 
   /// Create a global module fragment for a C++ module unit.
   ///
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index 31ae2b94f5b61..da5dda063344f 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -338,6 +338,11 @@ auto isZeroParamConstMemberCall() {
       callee(cxxMethodDecl(parameterCountIs(0), isConst())));
 }
 
+auto isZeroParamConstMemberOperatorCall() {
+  return cxxOperatorCallExpr(
+      callee(cxxMethodDecl(parameterCountIs(0), isConst())));
+}
+
 auto isNonConstMemberCall() {
   return cxxMemberCallExpr(callee(cxxMethodDecl(unless(isConst()))));
 }
@@ -572,9 +577,10 @@ void handleConstMemberCall(const CallExpr *CE,
     return;
   }
 
-  // Cache if the const method returns a boolean type.
+  // Cache if the const method returns a boolean or pointer type.
   // We may decide to cache other return types in the future.
-  if (RecordLoc != nullptr && CE->getType()->isBooleanType()) {
+  if (RecordLoc != nullptr &&
+      (CE->getType()->isBooleanType() || CE->getType()->isPointerType())) {
     Value *Val = State.Lattice.getOrCreateConstMethodReturnValue(*RecordLoc, CE,
                                                                  State.Env);
     if (Val == nullptr)
@@ -597,6 +603,14 @@ void transferValue_ConstMemberCall(const CXXMemberCallExpr *MCE,
       MCE, dataflow::getImplicitObjectLocation(*MCE, State.Env), Result, State);
 }
 
+void transferValue_ConstMemberOperatorCall(
+    const CXXOperatorCallExpr *OCE, const MatchFinder::MatchResult &Result,
+    LatticeTransferState &State) {
+  auto *RecordLoc = cast_or_null<dataflow::RecordStorageLocation>(
+      State.Env.getStorageLocation(*OCE->getArg(0)));
+  handleConstMemberCall(OCE, RecordLoc, Result, State);
+}
+
 void handleNonConstMemberCall(const CallExpr *CE,
                               dataflow::RecordStorageLocation *RecordLoc,
                               const MatchFinder::MatchResult &Result,
@@ -1020,6 +1034,8 @@ auto buildTransferMatchSwitch() {
       // const accessor calls
       .CaseOfCFGStmt<CXXMemberCallExpr>(isZeroParamConstMemberCall(),
                                         transferValue_ConstMemberCall)
+      .CaseOfCFGStmt<CXXOperatorCallExpr>(isZeroParamConstMemberOperatorCall(),
+                                          transferValue_ConstMemberOperatorCall)
       // non-const member calls that may modify the state of an object.
       .CaseOfCFGStmt<CXXMemberCallExpr>(isNonConstMemberCall(),
                                         transferValue_NonConstMemberCall)
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
index a7a3f6b37efef..330108d5b3e47 100644
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -54,7 +54,6 @@ Module::Module(ModuleConstructorTag, StringRef Name,
     NoUndeclaredIncludes = Parent->NoUndeclaredIncludes;
     ModuleMapIsPrivate = Parent->ModuleMapIsPrivate;
 
-    Parent->SubModuleIndex[Name] = Parent->SubModules.size();
     Parent->SubModules.push_back(this);
   }
 }
@@ -351,11 +350,14 @@ void Module::markUnavailable(bool Unimportable) {
 }
 
 Module *Module::findSubmodule(StringRef Name) const {
-  llvm::StringMap<unsigned>::const_iterator Pos = SubModuleIndex.find(Name);
-  if (Pos == SubModuleIndex.end())
-    return nullptr;
+  // Add new submodules into the index.
+  for (unsigned I = SubModuleIndex.size(), E = SubModules.size(); I != E; ++I)
+    SubModuleIndex[SubModules[I]->Name] = I;
 
-  return SubModules[Pos->getValue()];
+  if (auto It = SubModuleIndex.find(Name); It != SubModuleIndex.end())
+    return SubModules[It->second];
+
+  return nullptr;
 }
 
 Module *Module::getGlobalModuleFragment() const {
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d067ec218b527..700c2f9a5dbd1 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -1465,7 +1465,7 @@ bool X86TargetInfo::validateAsmConstraint(
     }
   case 'f': // Any x87 floating point stack register.
     // Constraint 'f' cannot be used for output operands.
-    if (Info.ConstraintStr[0] == '=')
+    if (Info.ConstraintStr[0] == '=' || Info.ConstraintStr[0] == '+')
       return false;
     Info.setAllowsRegister();
     return true;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index df1f4017606f7..82e6326a1c42a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17,6 +17,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGRecordLayout.h"
+#include "CGValue.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"
@@ -25,8 +26,10 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/OSLog.h"
 #include "clang/AST/OperationKinds.h"
+#include "clang/AST/Type.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetOptions.h"
@@ -67,6 +70,7 @@
 #include "llvm/TargetParser/X86TargetParser.h"
 #include <optional>
 #include <sstream>
+#include <utility>
 
 using namespace clang;
 using namespace CodeGen;
@@ -95,6 +99,76 @@ static void initializeAlloca(CodeGenFunction &CGF, AllocaInst *AI, Value *Size,
   I->addAnnotationMetadata("auto-init");
 }
 
+static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) {
+  Value *Op0 = CGF->EmitScalarExpr(E->getArg(0));
+  const auto *OutArg1 = dyn_cast<HLSLOutArgExpr>(E->getArg(1));
+  const auto *OutArg2 = dyn_cast<HLSLOutArgExpr>(E->getArg(2));
+
+  CallArgList Args;
+  LValue Op1TmpLValue =
+      CGF->EmitHLSLOutArgExpr(OutArg1, Args, OutArg1->getType());
+  LValue Op2TmpLValue =
+      CGF->EmitHLSLOutArgExpr(OutArg2, Args, OutArg2->getType());
+
+  if (CGF->getTarget().getCXXABI().areArgsDestroyedLeftToRightInCallee())
+    Args.reverseWritebacks();
+
+  Value *LowBits = nullptr;
+  Value *HighBits = nullptr;
+
+  if (CGF->CGM.getTarget().getTriple().isDXIL()) {
+
+    llvm::Type *RetElementTy = CGF->Int32Ty;
+    if (auto *Op0VecTy = E->getArg(0)->getType()->getAs<clang::VectorType>())
+      RetElementTy = llvm::VectorType::get(
+          CGF->Int32Ty, ElementCount::getFixed(Op0VecTy->getNumElements()));
+    auto *RetTy = llvm::StructType::get(RetElementTy, RetElementTy);
+
+    CallInst *CI = CGF->Builder.CreateIntrinsic(
+        RetTy, Intrinsic::dx_splitdouble, {Op0}, nullptr, "hlsl.splitdouble");
+
+    LowBits = CGF->Builder.CreateExtractValue(CI, 0);
+    HighBits = CGF->Builder.CreateExtractValue(CI, 1);
+
+  } else {
+    // For Non DXIL targets we generate the instructions.
+
+    if (!Op0->getType()->isVectorTy()) {
+      FixedVectorType *DestTy = FixedVectorType::get(CGF->Int32Ty, 2);
+      Value *Bitcast = CGF->Builder.CreateBitCast(Op0, DestTy);
+
+      LowBits = CGF->Builder.CreateExtractElement(Bitcast, (uint64_t)0);
+      HighBits = CGF->Builder.CreateExtractElement(Bitcast, 1);
+    } else {
+      int NumElements = 1;
+      if (const auto *VecTy =
+              E->getArg(0)->getType()->getAs<clang::VectorType>())
+        NumElements = VecTy->getNumElements();
+
+      FixedVectorType *Uint32VecTy =
+          FixedVectorType::get(CGF->Int32Ty, NumElements * 2);
+      Value *Uint32Vec = CGF->Builder.CreateBitCast(Op0, Uint32VecTy);
+      if (NumElements == 1) {
+        LowBits = CGF->Builder.CreateExtractElement(Uint32Vec, (uint64_t)0);
+        HighBits = CGF->Builder.CreateExtractElement(Uint32Vec, 1);
+      } else {
+        SmallVector<int> EvenMask, OddMask;
+        for (int I = 0, E = NumElements; I != E; ++I) {
+          EvenMask.push_back(I * 2);
+          OddMask.push_back(I * 2 + 1);
+        }
+        LowBits = CGF->Builder.CreateShuffleVector(Uint32Vec, EvenMask);
+        HighBits = CGF->Builder.CreateShuffleVector(Uint32Vec, OddMask);
+      }
+    }
+  }
+  CGF->Builder.CreateStore(LowBits, Op1TmpLValue.getAddress());
+  auto *LastInst =
+      CGF->Builder.CreateStore(HighBits, Op2TmpLValue.getAddress());
+  CGF->EmitWritebacks(Args);
+  return LastInst;
+}
+
 /// getBuiltinLibFunction - Given a builtin id for a function like
 /// "__builtin_fabsf", return a Function* for "fabsf".
 llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
@@ -18956,6 +19030,14 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: {
         CGM.getHLSLRuntime().getRadiansIntrinsic(), ArrayRef<Value *>{Op0},
         nullptr, "hlsl.radians");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_splitdouble: {
+
+    assert((E->getArg(0)->getType()->hasFloatingRepresentation() &&
+            E->getArg(1)->getType()->hasUnsignedIntegerRepresentation() &&
+            E->getArg(2)->getType()->hasUnsignedIntegerRepresentation()) &&
+           "asuint operands types mismatch");
+    return handleHlslSplitdouble(E, this);
+  }
   }
   return nullptr;
 }
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 72fd357be0f7d..fe581b01a365c 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -40,6 +40,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <optional>
 using namespace clang;
@@ -4257,12 +4258,6 @@ static void emitWriteback(CodeGenFunction &CGF,
     CGF.EmitBlock(contBB);
 }
 
-static void emitWritebacks(CodeGenFunction &CGF,
-                           const CallArgList &args) {
-  for (const auto &I : args.writebacks())
-    emitWriteback(CGF, I);
-}
-
 static void deactivateArgCleanupsBeforeCall(CodeGenFunction &CGF,
                                             const CallArgList &CallArgs) {
   ArrayRef<CallArgList::CallArgCleanup> Cleanups =
@@ -4731,6 +4726,11 @@ void CallArg::copyInto(CodeGenFunction &CGF, Address Addr) const {
   IsUsed = true;
 }
 
+void CodeGenFunction::EmitWritebacks(const CallArgList &args) {
+  for (const auto &I : args.writebacks())
+    emitWriteback(*this, I);
+}
+
 void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
                                   QualType type) {
   DisableDebugLocationUpdates Dis(*this, E);
@@ -5954,7 +5954,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // Emit any call-associated writebacks immediately.  Arguably this
   // should happen after any return-value munging.
   if (CallArgs.hasWritebacks())
-    emitWritebacks(*this, CallArgs);
+    EmitWritebacks(CallArgs);
 
   // The stack cleanup for inalloca arguments has to run out of the normal
   // lexical order, so deactivate it and run it manually here.
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 28d185d1a6b6f..d31763ca496be 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -5460,9 +5460,8 @@ LValue CodeGenFunction::EmitOpaqueValueLValue(const OpaqueValueExpr *e) {
   return getOrCreateOpaqueLValueMapping(e);
 }
 
-void CodeGenFunction::EmitHLSLOutArgExpr(const HLSLOutArgExpr *E,
-                                         CallArgList &Args, QualType Ty) {
-
+std::pair<LValue, LValue>
+CodeGenFunction::EmitHLSLOutArgLValues(const HLSLOutArgExpr *E, QualType Ty) {
   // Emitting the casted temporary through an opaque value.
   LValue BaseLV = EmitLValue(E->getArgLValue());
   OpaqueValueMappingData::bind(*this, E->getOpaqueArgLValue(), BaseLV);
@@ -5476,6 +5475,13 @@ void CodeGenFunction::EmitHLSLOutArgExpr(const HLSLOutArgExpr *E,
                                TempLV);
 
   OpaqueValueMappingData::bind(*this, E->getCastedTemporary(), TempLV);
+  return std::make_pair(BaseLV, TempLV);
+}
+
+LValue CodeGenFunction::EmitHLSLOutArgExpr(const HLSLOutArgExpr *E,
+                                           CallArgList &Args, QualType Ty) {
+
+  auto [BaseLV, TempLV] = EmitHLSLOutArgLValues(E, Ty);
 
   llvm::Value *Addr = TempLV.getAddress().getBasePointer();
   llvm::Type *ElTy = ConvertTypeForMem(TempLV.getType());
@@ -5488,6 +5494,7 @@ void CodeGenFunction::EmitHLSLOutArgExpr(const HLSLOutArgExpr *E,
   Args.addWriteback(BaseLV, TmpAddr, nullptr, E->getWritebackCast(),
                     LifetimeSize);
   Args.add(RValue::get(TmpAddr, *this), Ty);
+  return TempLV;
 }
 
 LValue
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index e06efe0b7e8dc..192f488d97ec2 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4379,8 +4379,11 @@ class CodeGenFunction : public CodeGenTypeCache {
   LValue EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *E);
   LValue EmitOpaqueValueLValue(const OpaqueValueExpr *e);
   LValue EmitHLSLArrayAssignLValue(const BinaryOperator *E);
-  void EmitHLSLOutArgExpr(const HLSLOutArgExpr *E, CallArgList &Args,
-                          QualType Ty);
+
+  std::pair<LValue, LValue> EmitHLSLOutArgLValues(const HLSLOutArgExpr *E,
+                                                  QualType Ty);
+  LValue EmitHLSLOutArgExpr(const HLSLOutArgExpr *E, CallArgList &Args,
+                            QualType Ty);
 
   Address EmitExtVectorElementLValue(LValue V);
 
@@ -5240,6 +5243,9 @@ class CodeGenFunction : public CodeGenTypeCache {
                            SourceLocation ArgLoc, AbstractCallee AC,
                            unsigned ParmNum);
 
+  /// EmitWriteback - Emit callbacks for function.
+  void EmitWritebacks(const CallArgList &Args);
+
   /// EmitCallArg - Emit a single call argument.
   void EmitCallArg(CallArgList &args, const Expr *E, QualType ArgType);
 
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 30dce60b3ff70..8ade4b27f360f 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -438,6 +438,24 @@ template <typename T> constexpr uint asuint(T F) {
   return __detail::bit_cast<uint, T>(F);
 }
 
+//===----------------------------------------------------------------------===//
+// asuint splitdouble builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn void asuint(double D, out uint lowbits, out int highbits)
+/// \brief Split and interprets the lowbits and highbits of double D into uints.
+/// \param D The input double.
+/// \param lowbits The output lowbits of D.
+/// \param highbits The output highbits of D.
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_splitdouble)
+void asuint(double, out uint, out uint);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_splitdouble)
+void asuint(double2, out uint2, out uint2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_splitdouble)
+void asuint(double3, out uint3, out uint3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_splitdouble)
+void asuint(double4, out uint4, out uint4);
+
 //===----------------------------------------------------------------------===//
 // atan builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index 201ab91cf68ca..dc9d2bfd5629c 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -655,8 +655,8 @@ ModuleMap::findOrCreateModuleForHeaderInUmbrellaDir(FileEntryRef File) {
         SmallString<32> NameBuf;
         StringRef Name = sanitizeFilenameAsIdentifier(
             llvm::sys::path::stem(SkippedDir.getName()), NameBuf);
-        Result = findOrCreateModule(Name, Result, /*IsFramework=*/false,
-                                    Explicit).first;
+        Result = findOrCreateModuleFirst(Name, Result, /*IsFramework=*/false,
+                                         Explicit);
         setInferredModuleAllowedBy(Result, UmbrellaModuleMap);
 
         // Associate the module and the directory.
@@ -672,8 +672,8 @@ ModuleMap::findOrCreateModuleForHeaderInUmbrellaDir(FileEntryRef File) {
       SmallString<32> NameBuf;
       StringRef Name = sanitizeFilenameAsIdentifier(
                          llvm::sys::path::stem(File.getName()), NameBuf);
-      Result = findOrCreateModule(Name, Result, /*IsFramework=*/false,
-                                  Explicit).first;
+      Result = findOrCreateModuleFirst(Name, Result, /*IsFramework=*/false,
+                                       Explicit);
       setInferredModuleAllowedBy(Result, UmbrellaModuleMap);
       Result->addTopHeader(File);
 
@@ -866,6 +866,15 @@ std::pair<Module *, bool> ModuleMap::findOrCreateModule(StringRef Name,
     return std::make_pair(Sub, false);
 
   // Create a new module with this name.
+  Module *M = createModule(Name, Parent, IsFramework, IsExplicit);
+  return std::make_pair(M, true);
+}
+
+Module *ModuleMap::createModule(StringRef Name, Module *Parent,
+                                bool IsFramework, bool IsExplicit) {
+  assert(lookupModuleQualified(Name, Parent) == nullptr &&
+         "Creating duplicate submodule");
+
   Module *Result = new (ModulesAlloc.Allocate())
       Module(ModuleConstructorTag{}, Name, SourceLocation(), Parent,
              IsFramework, IsExplicit, NumCreatedModules++);
@@ -875,7 +884,7 @@ std::pair<Module *, bool> ModuleMap::findOrCreateModule(StringRef Name,
     Modules[Name] = Result;
     ModuleScopeIDs[Result] = CurrentModuleScopeID;
   }
-  return std::make_pair(Result, true);
+  return Result;
 }
 
 Module *ModuleMap::createGlobalModuleFragmentForModuleUnit(SourceLocation Loc,
@@ -2123,9 +2132,8 @@ void ModuleMapParser::parseModuleDecl() {
     ActiveModule =
         Map.createShadowedModule(ModuleName, Framework, ShadowingModule);
   } else {
-    ActiveModule =
-        Map.findOrCreateModule(ModuleName, ActiveModule, Framework, Explicit)
-            .first;
+    ActiveModule = Map.findOrCreateModuleFirst(ModuleName, ActiveModule,
+                                               Framework, Explicit);
   }
 
   ActiveModule->DefinitionLoc = ModuleNameLoc;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 27b274d74ce71..d027e4c6dfdb4 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -8899,18 +8899,36 @@ void Sema::CheckMemaccessArguments(const CallExpr *Call,
           << ArgIdx << FnName << PointeeTy
           << Call->getCallee()->getSourceRange());
     else if (const auto *RT = PointeeTy->getAs<RecordType>()) {
+
+      bool IsTriviallyCopyableCXXRecord =
+          RT->desugar().isTriviallyCopyableType(Context);
+
       if ((BId == Builtin::BImemset || BId == Builtin::BIbzero) &&
           RT->getDecl()->isNonTrivialToPrimitiveDefaultInitialize()) {
         DiagRuntimeBehavior(Dest->getExprLoc(), Dest,
                             PDiag(diag::warn_cstruct_memaccess)
                                 << ArgIdx << FnName << PointeeTy << 0);
         SearchNonTrivialToInitializeField::diag(PointeeTy, Dest, *this);
+      } else if ((BId == Builtin::BImemset || BId == Builtin::BIbzero) &&
+                 !IsTriviallyCopyableCXXRecord && ArgIdx == 0) {
+        // FIXME: Limiting this warning to dest argument until we decide
+        // whether it's valid for source argument too.
+        DiagRuntimeBehavior(Dest->getExprLoc(), Dest,
+                            PDiag(diag::warn_cxxstruct_memaccess)
+                                << FnName << PointeeTy);
       } else if ((BId == Builtin::BImemcpy || BId == Builtin::BImemmove) &&
                  RT->getDecl()->isNonTrivialToPrimitiveCopy()) {
         DiagRuntimeBehavior(Dest->getExprLoc(), Dest,
                             PDiag(diag::warn_cstruct_memaccess)
                                 << ArgIdx << FnName << PointeeTy << 1);
         SearchNonTrivialToCopyField::diag(PointeeTy, Dest, *this);
+      } else if ((BId == Builtin::BImemcpy || BId == Builtin::BImemmove) &&
+                 !IsTriviallyCopyableCXXRecord && ArgIdx == 0) {
+        // FIXME: Limiting this warning to dest argument until we decide
+        // whether it's valid for source argument too.
+        DiagRuntimeBehavior(Dest->getExprLoc(), Dest,
+                            PDiag(diag::warn_cxxstruct_memaccess)
+                                << FnName << PointeeTy);
       } else {
         continue;
       }
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 1f6c5b8d4561b..a472538236e2d 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -1698,18 +1698,27 @@ static bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) {
   return true;
 }
 
-static bool CheckArgsTypesAreCorrect(
+bool CheckArgTypeIsCorrect(
+    Sema *S, Expr *Arg, QualType ExpectedType,
+    llvm::function_ref<bool(clang::QualType PassedType)> Check) {
+  QualType PassedType = Arg->getType();
+  if (Check(PassedType)) {
+    if (auto *VecTyA = PassedType->getAs<VectorType>())
+      ExpectedType = S->Context.getVectorType(
+          ExpectedType, VecTyA->getNumElements(), VecTyA->getVectorKind());
+    S->Diag(Arg->getBeginLoc(), diag::err_typecheck_convert_incompatible)
+        << PassedType << ExpectedType << 1 << 0 << 0;
+    return true;
+  }
+  return false;
+}
+
+bool CheckAllArgTypesAreCorrect(
     Sema *S, CallExpr *TheCall, QualType ExpectedType,
     llvm::function_ref<bool(clang::QualType PassedType)> Check) {
   for (unsigned i = 0; i < TheCall->getNumArgs(); ++i) {
-    QualType PassedType = TheCall->getArg(i)->getType();
-    if (Check(PassedType)) {
-      if (auto *VecTyA = PassedType->getAs<VectorType>())
-        ExpectedType = S->Context.getVectorType(
-            ExpectedType, VecTyA->getNumElements(), VecTyA->getVectorKind());
-      S->Diag(TheCall->getArg(0)->getBeginLoc(),
-              diag::err_typecheck_convert_incompatible)
-          << PassedType << ExpectedType << 1 << 0 << 0;
+    Expr *Arg = TheCall->getArg(i);
+    if (CheckArgTypeIsCorrect(S, Arg, ExpectedType, Check)) {
       return true;
     }
   }
@@ -1720,8 +1729,8 @@ static bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
   auto checkAllFloatTypes = [](clang::QualType PassedType) -> bool {
     return !PassedType->hasFloatingRepresentation();
   };
-  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
-                                  checkAllFloatTypes);
+  return CheckAllArgTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                    checkAllFloatTypes);
 }
 
 static bool CheckFloatOrHalfRepresentations(Sema *S, CallExpr *TheCall) {
@@ -1732,8 +1741,19 @@ static bool CheckFloatOrHalfRepresentations(Sema *S, CallExpr *TheCall) {
             : PassedType;
     return !BaseType->isHalfType() && !BaseType->isFloat32Type();
   };
-  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
-                                  checkFloatorHalf);
+  return CheckAllArgTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                    checkFloatorHalf);
+}
+
+static bool CheckModifiableLValue(Sema *S, CallExpr *TheCall,
+                                  unsigned ArgIndex) {
+  auto *Arg = TheCall->getArg(ArgIndex);
+  SourceLocation OrigLoc = Arg->getExprLoc();
+  if (Arg->IgnoreCasts()->isModifiableLvalue(S->Context, &OrigLoc) ==
+      Expr::MLV_Valid)
+    return false;
+  S->Diag(OrigLoc, diag::error_hlsl_inout_lvalue) << Arg << 0;
+  return true;
 }
 
 static bool CheckNoDoubleVectors(Sema *S, CallExpr *TheCall) {
@@ -1742,24 +1762,24 @@ static bool CheckNoDoubleVectors(Sema *S, CallExpr *TheCall) {
       return VecTy->getElementType()->isDoubleType();
     return false;
   };
-  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
-                                  checkDoubleVector);
+  return CheckAllArgTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                    checkDoubleVector);
 }
 static bool CheckFloatingOrIntRepresentation(Sema *S, CallExpr *TheCall) {
   auto checkAllSignedTypes = [](clang::QualType PassedType) -> bool {
     return !PassedType->hasIntegerRepresentation() &&
            !PassedType->hasFloatingRepresentation();
   };
-  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.IntTy,
-                                  checkAllSignedTypes);
+  return CheckAllArgTypesAreCorrect(S, TheCall, S->Context.IntTy,
+                                    checkAllSignedTypes);
 }
 
 static bool CheckUnsignedIntRepresentation(Sema *S, CallExpr *TheCall) {
   auto checkAllUnsignedTypes = [](clang::QualType PassedType) -> bool {
     return !PassedType->hasUnsignedIntegerRepresentation();
   };
-  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.UnsignedIntTy,
-                                  checkAllUnsignedTypes);
+  return CheckAllArgTypesAreCorrect(S, TheCall, S->Context.UnsignedIntTy,
+                                    checkAllUnsignedTypes);
 }
 
 static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
@@ -2074,6 +2094,22 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     break;
   }
+  case Builtin::BI__builtin_hlsl_elementwise_splitdouble: {
+    if (SemaRef.checkArgCount(TheCall, 3))
+      return true;
+
+    if (CheckScalarOrVector(&SemaRef, TheCall, SemaRef.Context.DoubleTy, 0) ||
+        CheckScalarOrVector(&SemaRef, TheCall, SemaRef.Context.UnsignedIntTy,
+                            1) ||
+        CheckScalarOrVector(&SemaRef, TheCall, SemaRef.Context.UnsignedIntTy,
+                            2))
+      return true;
+
+    if (CheckModifiableLValue(&SemaRef, TheCall, 1) ||
+        CheckModifiableLValue(&SemaRef, TheCall, 2))
+      return true;
+    break;
+  }
   case Builtin::BI__builtin_elementwise_acos:
   case Builtin::BI__builtin_elementwise_asin:
   case Builtin::BI__builtin_elementwise_atan:
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 2419ed84e68ac..8d8f9378cfeab 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5756,6 +5756,14 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
     return Err;
 
   ModuleMap &ModMap = PP.getHeaderSearchInfo().getModuleMap();
+  bool KnowsTopLevelModule = ModMap.findModule(F.ModuleName) != nullptr;
+  // If we don't know the top-level module, there's no point in doing qualified
+  // lookup of its submodules; it won't find anything anywhere within this tree.
+  // Let's skip that and avoid some string lookups.
+  auto CreateModule = !KnowsTopLevelModule
+                          ? &ModuleMap::createModule
+                          : &ModuleMap::findOrCreateModuleFirst;
+
   bool First = true;
   Module *CurrentModule = nullptr;
   RecordData Record;
@@ -5829,11 +5837,8 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F,
       if (Parent)
         ParentModule = getSubmodule(Parent);
 
-      // Retrieve this (sub)module from the module map, creating it if
-      // necessary.
-      CurrentModule =
-          ModMap.findOrCreateModule(Name, ParentModule, IsFramework, IsExplicit)
-              .first;
+      CurrentModule = std::invoke(CreateModule, &ModMap, Name, ParentModule,
+                                  IsFramework, IsExplicit);
 
       SubmoduleID GlobalIndex = GlobalID - NUM_PREDEF_SUBMODULE_IDS;
       if (GlobalIndex >= SubmodulesLoaded.size() ||
diff --git a/clang/test/ClangScanDeps/print-timing.c b/clang/test/ClangScanDeps/print-timing.c
index f27df1ebf732a..fa2a433b95537 100644
--- a/clang/test/ClangScanDeps/print-timing.c
+++ b/clang/test/ClangScanDeps/print-timing.c
@@ -3,7 +3,8 @@
 
 // RUN: clang-scan-deps -compilation-database %t/cdb.json -print-timing > %t/result.json 2>%t/errs
 // RUN: cat %t/errs | FileCheck %s
-// CHECK: clang-scan-deps timing: {{[0-9]+}}.{{[0-9][0-9]}}s wall, {{[0-9]+}}.{{[0-9][0-9]}}s process
+// CHECK:      wall time [s]              process time [s]           instruction count
+// CHECK-NEXT: {{[0-9]+}}.{{([0-9]{4})}}  {{[0-9]+}}.{{([0-9]{4})}}  {{[0-9]+}}
 
 //--- cdb.json
 []
diff --git a/clang/test/CodeGen/X86/avx-cmp-builtins.c b/clang/test/CodeGen/X86/avx-cmp-builtins.c
index c4e3c7ccd5498..2e4a383a6b3fc 100644
--- a/clang/test/CodeGen/X86/avx-cmp-builtins.c
+++ b/clang/test/CodeGen/X86/avx-cmp-builtins.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -O3 -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
-// FIXME: The shufflevector instructions in test_cmpgt_sd are relying on O3 here.
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-- -target-feature +avx -disable-O0-optnone -emit-llvm -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386-- -target-feature +avx -disable-O0-optnone -emit-llvm -o - | opt -S -passes=mem2reg | FileCheck %s
 
 
 #include <immintrin.h>
@@ -9,62 +9,124 @@
 // Test LLVM IR codegen of cmpXY instructions
 //
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmp_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[A]], <2 x double> [[B]], i8 13)
+// CHECK-NEXT:    ret <2 x double> [[TMP0]]
+//
 __m128d test_cmp_sd(__m128d a, __m128d b) {
   // Expects that the third argument in LLVM IR is immediate expression
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 13)
   return _mm_cmp_sd(a, b, _CMP_GE_OS);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmp_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[A]], <4 x float> [[B]], i8 13)
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
 __m128 test_cmp_ss(__m128 a, __m128 b) {
   // Expects that the third argument in LLVM IR is immediate expression
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 13)
   return _mm_cmp_ss(a, b, _CMP_GE_OS);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpgt_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[B]], <4 x float> [[A]], i8 1)
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[TMP0]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpgt_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 1)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   return _mm_cmpgt_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpge_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[B]], <4 x float> [[A]], i8 2)
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[TMP0]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpge_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 2)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   return _mm_cmpge_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpngt_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[B]], <4 x float> [[A]], i8 5)
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[TMP0]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpngt_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 5)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   return _mm_cmpngt_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_cmpnge_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[B]], <4 x float> [[A]], i8 6)
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[TMP0]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x float> [[SHUFFLE_I]]
+//
 __m128 test_cmpnge_ss(__m128 a, __m128 b) {
-  // CHECK: @llvm.x86.sse.cmp.ss({{.*}}, i8 6)
-  // CHECK: shufflevector <{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   return _mm_cmpnge_ss(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpgt_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[B]], <2 x double> [[A]], i8 1)
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT:    ret <2 x double> [[VECINIT2_I]]
+//
 __m128d test_cmpgt_sd(__m128d a, __m128d b) {
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 1)
-  // CHECK: shufflevector <{{.*}}, <2 x i32> <i32 0, i32 3>
   return _mm_cmpgt_sd(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpge_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[B]], <2 x double> [[A]], i8 2)
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT:    ret <2 x double> [[VECINIT2_I]]
+//
 __m128d test_cmpge_sd(__m128d a, __m128d b) {
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 2)
-  // CHECK: shufflevector <{{.*}}, <2 x i32> <i32 0, i32 3>
   return _mm_cmpge_sd(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpngt_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[B]], <2 x double> [[A]], i8 5)
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT:    ret <2 x double> [[VECINIT2_I]]
+//
 __m128d test_cmpngt_sd(__m128d a, __m128d b) {
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 5)
-  // CHECK: shufflevector <{{.*}}, <2 x i32> <i32 0, i32 3>
   return _mm_cmpngt_sd(a, b);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_cmpnge_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[B]], <2 x double> [[A]], i8 6)
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// CHECK-NEXT:    ret <2 x double> [[VECINIT2_I]]
+//
 __m128d test_cmpnge_sd(__m128d a, __m128d b) {
-  // CHECK: @llvm.x86.sse2.cmp.sd({{.*}}, i8 6)
-  // CHECK: shufflevector <{{.*}}, <2 x i32> <i32 0, i32 3>
   return _mm_cmpnge_sd(a, b);
 }
diff --git a/clang/test/CodeGen/X86/avx-shuffle-builtins.c b/clang/test/CodeGen/X86/avx-shuffle-builtins.c
index d184d28f3e07a..1c05fa436983e 100644
--- a/clang/test/CodeGen/X86/avx-shuffle-builtins.c
+++ b/clang/test/CodeGen/X86/avx-shuffle-builtins.c
@@ -1,7 +1,7 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X86
-// FIXME: This is testing optimized generation of shuffle instructions and should be fixed.
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-- -target-feature +avx -disable-O0-optnone -emit-llvm -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386-- -target-feature +avx -disable-O0-optnone -emit-llvm -o - | opt -S -passes=mem2reg | FileCheck %s
 
 
 #include <immintrin.h>
@@ -10,201 +10,341 @@
 // Test LLVM IR codegen of shuffle instructions, checking if the masks are correct
 //
 
+// CHECK-LABEL: define dso_local <8 x float> @x(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFP:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
+// CHECK-NEXT:    ret <8 x float> [[SHUFP]]
+//
 __m256 x(__m256 a, __m256 b) {
-  // CHECK-LABEL: x
-  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
   return _mm256_shuffle_ps(a, b, 203);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_permute_pd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT:    ret <2 x double> [[PERMIL]]
+//
 __m128d test_mm_permute_pd(__m128d a) {
-  // CHECK-LABEL: test_mm_permute_pd
-  // CHECK: shufflevector{{.*}}<i32 1, i32 0>
   return _mm_permute_pd(a, 1);
 }
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute_pd(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT:    ret <4 x double> [[PERMIL]]
+//
 __m256d test_mm256_permute_pd(__m256d a) {
-  // CHECK-LABEL: test_mm256_permute_pd
-  // CHECK: shufflevector{{.*}}<i32 1, i32 0, i32 3, i32 2>
   return _mm256_permute_pd(a, 5);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <4 x float> [[PERMIL]]
+//
 __m128 test_mm_permute_ps(__m128 a) {
-  // CHECK-LABEL: test_mm_permute_ps
-  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0>
   return _mm_permute_ps(a, 0x1b);
 }
 
-// Test case for PR12401
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps2(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x float> [[PERMIL]]
+//
 __m128 test_mm_permute_ps2(__m128 a) {
-  // CHECK-LABEL: test_mm_permute_ps2
-  // CHECK: shufflevector{{.*}}<i32 2, i32 1, i32 2, i32 3>
   return _mm_permute_ps(a, 0xe6);
 }
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute_ps(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT:    ret <8 x float> [[PERMIL]]
+//
 __m256 test_mm256_permute_ps(__m256 a) {
-  // CHECK-LABEL: test_mm256_permute_ps
-  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
   return _mm256_permute_ps(a, 0x1b);
 }
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute2f128_pd(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VPERM:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+// CHECK-NEXT:    ret <4 x double> [[VPERM]]
+//
 __m256d test_mm256_permute2f128_pd(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_permute2f128_pd
-  // CHECK: shufflevector{{.*}}<i32 2, i32 3, i32 6, i32 7> 
   return _mm256_permute2f128_pd(a, b, 0x31);
 }
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute2f128_ps(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VPERM:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    ret <8 x float> [[VPERM]]
+//
 __m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_permute2f128_ps
-  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   return _mm256_permute2f128_ps(a, b, 0x13);
 }
 
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_permute2f128_si256(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32>
+// CHECK-NEXT:    [[VPERM:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[VPERM]] to <4 x i64>
+// CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+//
 __m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_permute2f128_si256
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
   return _mm256_permute2f128_si256(a, b, 0x20);
 }
 
-__m128
-test_mm_broadcast_ss(float const *__a) {
-  // CHECK-LABEL: test_mm_broadcast_ss
-  // CHECK: insertelement <4 x float> {{.*}}, i64 0
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_broadcast_ss(
+// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__A]], align 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[TMP0]], i32 2
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT3_I]], float [[TMP0]], i32 3
+// CHECK-NEXT:    ret <4 x float> [[VECINIT4_I]]
+//
+__m128 test_mm_broadcast_ss(float const *__a) {
   return _mm_broadcast_ss(__a);
 }
 
-__m256d
-test_mm256_broadcast_sd(double const *__a) {
-  // CHECK-LABEL: test_mm256_broadcast_sd
-  // CHECK: insertelement <4 x double> {{.*}}, i64 0
-  // CHECK: shufflevector <4 x double> {{.*}}, <4 x double> poison, <4 x i32> zeroinitializer
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_broadcast_sd(
+// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[__A]], align 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x double> [[VECINIT_I]], double [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x double> [[VECINIT2_I]], double [[TMP0]], i32 2
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x double> [[VECINIT3_I]], double [[TMP0]], i32 3
+// CHECK-NEXT:    ret <4 x double> [[VECINIT4_I]]
+//
+__m256d test_mm256_broadcast_sd(double const *__a) {
   return _mm256_broadcast_sd(__a);
 }
 
-__m256
-test_mm256_broadcast_ss(float const *__a) {
-  // CHECK-LABEL: test_mm256_broadcast_ss
-  // CHECK: insertelement <8 x float> {{.*}}, i64 0
-  // CHECK: shufflevector <8 x float> {{.*}}, <8 x float> poison, <8 x i32> zeroinitializer
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_broadcast_ss(
+// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[__A]], align 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP0]], i32 0
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP0]], i32 1
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP0]], i32 2
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP0]], i32 3
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP0]], i32 4
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP0]], i32 5
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP0]], i32 6
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <8 x float> [[VECINIT7_I]], float [[TMP0]], i32 7
+// CHECK-NEXT:    ret <8 x float> [[VECINIT8_I]]
+//
+__m256 test_mm256_broadcast_ss(float const *__a) {
   return _mm256_broadcast_ss(__a);
 }
 
 // Make sure we have the correct mask for each insertf128 case.
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_0(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[INSERT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    ret <8 x float> [[INSERT]]
+//
 __m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
-  // CHECK-LABEL: test_mm256_insertf128_ps_0
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
   return _mm256_insertf128_ps(a, b, 0);
 }
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_insertf128_pd_0(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[INSERT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[WIDEN]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x double> [[INSERT]]
+//
 __m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
-  // CHECK-LABEL: test_mm256_insertf128_pd_0
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 6, i32 7>
   return _mm256_insertf128_pd(a, b, 0);
 }
 
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_insertf128_si256_0(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <4 x i32>
+// CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64>
+// CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+//
 __m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
-  // CHECK-LABEL: test_mm256_insertf128_si256_0
-  // X64: shufflevector{{.*}}<i32 0, i32 1, i32 6, i32 7>
-  // X86: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
   return _mm256_insertf128_si256(a, b, 0);
 }
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_1(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[INSERT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[WIDEN]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    ret <8 x float> [[INSERT]]
+//
 __m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
-  // CHECK-LABEL: test_mm256_insertf128_ps_1
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_insertf128_ps(a, b, 1);
 }
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_insertf128_pd_1(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[INSERT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[WIDEN]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+// CHECK-NEXT:    ret <4 x double> [[INSERT]]
+//
 __m256d test_mm256_insertf128_pd_1(__m256d a, __m128d b) {
-  // CHECK-LABEL: test_mm256_insertf128_pd_1
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
   return _mm256_insertf128_pd(a, b, 1);
 }
 
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_insertf128_si256_1(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <4 x i32>
+// CHECK-NEXT:    [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[WIDEN]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64>
+// CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+//
 __m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
-  // CHECK-LABEL: test_mm256_insertf128_si256_1
-  // X64: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
-  // X86: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_insertf128_si256(a, b, 1);
 }
 
 // Make sure we have the correct mask for each extractf128 case.
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm256_extractf128_ps_0(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x float> [[EXTRACT]]
+//
 __m128 test_mm256_extractf128_ps_0(__m256 a) {
-  // X64-LABEL: test_mm256_extractf128_ps_0
-  // X64: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
-  //
-  // X86-LABEL: test_mm256_extractf128_ps_0
-  // X86: shufflevector{{.*}}<i32 0, i32 1>
   return _mm256_extractf128_ps(a, 0);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm256_extractf128_pd_0(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT:    ret <2 x double> [[EXTRACT]]
+//
 __m128d test_mm256_extractf128_pd_0(__m256d a) {
-  // CHECK-LABEL: test_mm256_extractf128_pd_0
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
   return _mm256_extractf128_pd(a, 0);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm256_extractf128_si256_0(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[EXTRACT]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128i test_mm256_extractf128_si256_0(__m256i a) {
-  // CHECK-LABEL: test_mm256_extractf128_si256_0
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
   return _mm256_extractf128_si256(a, 0);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm256_extractf128_ps_1(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    ret <4 x float> [[EXTRACT]]
+//
 __m128 test_mm256_extractf128_ps_1(__m256 a) {
-  // X64-LABEL: test_mm256_extractf128_ps_1
-  // X64: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7>
-  //
-  // X86-LABEL: test_mm256_extractf128_ps_1
-  // X86: shufflevector{{.*}}<i32 2, i32 3>
   return _mm256_extractf128_ps(a, 1);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm256_extractf128_pd_1(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    ret <2 x double> [[EXTRACT]]
+//
 __m128d test_mm256_extractf128_pd_1(__m256d a) {
-  // CHECK-LABEL: test_mm256_extractf128_pd_1
-  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
   return _mm256_extractf128_pd(a, 1);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm256_extractf128_si256_1(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[EXTRACT]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128i test_mm256_extractf128_si256_1(__m256i a) {
-  // CHECK-LABEL: test_mm256_extractf128_si256_1
-  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
   return _mm256_extractf128_si256(a, 1);
 }
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_set_m128(
+// CHECK-SAME: <4 x float> noundef [[HI:%.*]], <4 x float> noundef [[LO:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[LO]], <4 x float> [[HI]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    ret <8 x float> [[SHUFFLE_I]]
+//
 __m256 test_mm256_set_m128(__m128 hi, __m128 lo) {
-  // CHECK-LABEL: test_mm256_set_m128
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_set_m128(hi, lo);
 }
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_set_m128d(
+// CHECK-SAME: <2 x double> noundef [[HI:%.*]], <2 x double> noundef [[LO:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[LO]], <2 x double> [[HI]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x double> [[SHUFFLE_I]]
+//
 __m256d test_mm256_set_m128d(__m128d hi, __m128d lo) {
-  // CHECK-LABEL: test_mm256_set_m128d
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
   return _mm256_set_m128d(hi, lo);
 }
 
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_set_m128i(
+// CHECK-SAME: <2 x i64> noundef [[HI:%.*]], <2 x i64> noundef [[LO:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[LO]], <2 x i64> [[HI]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x i64> [[SHUFFLE_I]]
+//
 __m256i test_mm256_set_m128i(__m128i hi, __m128i lo) {
-  // CHECK-LABEL: test_mm256_set_m128i
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>  
   return _mm256_set_m128i(hi, lo);
 }
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_setr_m128(
+// CHECK-SAME: <4 x float> noundef [[HI:%.*]], <4 x float> noundef [[LO:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> [[LO]], <4 x float> [[HI]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    ret <8 x float> [[SHUFFLE_I_I]]
+//
 __m256 test_mm256_setr_m128(__m128 hi, __m128 lo) {
-  // CHECK-LABEL: test_mm256_setr_m128
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_setr_m128(lo, hi);
 }
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_setr_m128d(
+// CHECK-SAME: <2 x double> noundef [[HI:%.*]], <2 x double> noundef [[LO:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x double> [[LO]], <2 x double> [[HI]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x double> [[SHUFFLE_I_I]]
+//
 __m256d test_mm256_setr_m128d(__m128d hi, __m128d lo) {
-  // CHECK-LABEL: test_mm256_setr_m128d
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
   return _mm256_setr_m128d(lo, hi);
 }
 
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_setr_m128i(
+// CHECK-SAME: <2 x i64> noundef [[HI:%.*]], <2 x i64> noundef [[LO:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[LO]], <2 x i64> [[HI]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    ret <4 x i64> [[SHUFFLE_I_I]]
+//
 __m256i test_mm256_setr_m128i(__m128i hi, __m128i lo) {
-  // CHECK-LABEL: test_mm256_setr_m128i
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
   return _mm256_setr_m128i(lo, hi);
 }
diff --git a/clang/test/CodeGen/X86/sse.c b/clang/test/CodeGen/X86/sse.c
index a75b8dc77e86e..017bdd7846fa3 100644
--- a/clang/test/CodeGen/X86/sse.c
+++ b/clang/test/CodeGen/X86/sse.c
@@ -1,42 +1,72 @@
-// RUN: %clang_cc1 -ffreestanding -O3 -triple x86_64-apple-macosx10.8.0 -target-feature +sse4.1 -emit-llvm %s -o - | FileCheck %s
-// FIXME: This test currently depends on optimization - it should be rewritten to avoid it.
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-- -target-feature +sse4.1 -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
 
 
 #include <emmintrin.h>
 
 // Byte-shifts look reversed due to xmm register layout
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[PSLLDQ:%.*]] = shufflevector <16 x i8> zeroinitializer, <16 x i8> [[CAST]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+// CHECK-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSLLDQ]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_slli_si128(__m128i a) {
-  // CHECK-LABEL: @test_mm_slli_si128
-  // CHECK: shufflevector <16 x i8> <{{.*}}, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> {{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
   return _mm_slli_si128(a, 5);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_0(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[PSLLDQ:%.*]] = shufflevector <16 x i8> zeroinitializer, <16 x i8> [[CAST]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+// CHECK-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSLLDQ]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_slli_si128_0(__m128i a) {
-  // CHECK-LABEL: @test_mm_slli_si128_0
-  // CHECK-NOT: shufflevector
   return _mm_slli_si128(a, 0);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_16(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <2 x i64> zeroinitializer
+//
 __m128i test_mm_slli_si128_16(__m128i a) {
-  // CHECK-LABEL: @test_mm_slli_si128_16
-  // CHECK-NOT: shufflevector
   return _mm_slli_si128(a, 16);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+// CHECK-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_srli_si128(__m128i a) {
-  // CHECK-LABEL: @test_mm_srli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, {{.*}}>, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
   return _mm_srli_si128(a, 5);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_0(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_srli_si128_0(__m128i a) {
-  // CHECK-LABEL: @test_mm_srli_si128_0
-  // CHECK-NOT: shufflevector
   return _mm_srli_si128(a, 0);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_16(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <2 x i64> zeroinitializer
+//
 __m128i test_mm_srli_si128_16(__m128i a) {
-  // CHECK-LABEL: @test_mm_srli_si128_16
-  // CHECK-NOT: shufflevector
   return _mm_srli_si128(a, 16);
 }
diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
new file mode 100644
index 0000000000000..a883c9d5cc355
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -O1 -o - | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -O0 -o - | FileCheck %s --check-prefix=SPIRV
+
+
+
+// CHECK: define {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]])
+// CHECK:      [[VALRET:%.*]] = {{.*}} call { i32, i32 } @llvm.dx.splitdouble.i32(double [[VALD]])
+// CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0
+// CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1
+//
+// SPIRV: define spir_func {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]])
+// SPIRV-NOT:  @llvm.dx.splitdouble.i32
+// SPIRV:      [[LOAD:%.*]] = load double, ptr [[VALD]].addr, align 8
+// SPIRV-NEXT: [[CAST:%.*]] = bitcast double [[LOAD]] to <2 x i32>
+// SPIRV-NEXT: extractelement <2 x i32> [[CAST]], i64 0
+// SPIRV-NEXT: extractelement <2 x i32> [[CAST]], i64 1
+uint test_scalar(double D) {
+  uint A, B;
+  asuint(D, A, B);
+  return A + B;
+}
+
+// CHECK: define {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]])
+// CHECK:      [[TRUNC:%.*]] = extractelement <1 x double> %D, i64 0
+// CHECK-NEXT: [[VALRET:%.*]] = {{.*}} call { i32, i32 } @llvm.dx.splitdouble.i32(double [[TRUNC]])
+// CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0
+// CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1
+//
+// SPIRV: define spir_func {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]])
+// SPIRV-NOT:  @llvm.dx.splitdouble.i32
+// SPIRV:      [[LOAD:%.*]] = load <1 x double>, ptr [[VALD]].addr, align 8
+// SPIRV-NEXT: [[TRUNC:%.*]] = extractelement <1 x double> [[LOAD]], i64 0
+// SPIRV-NEXT: [[CAST:%.*]] = bitcast double [[TRUNC]] to <2 x i32>
+// SPIRV-NEXT: extractelement <2 x i32> [[CAST]], i64 0
+// SPIRV-NEXT: extractelement <2 x i32> [[CAST]], i64 1
+uint1 test_double1(double1 D) {
+  uint A, B;
+  asuint(D, A, B);
+  return A + B;
+}
+
+// CHECK: define {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]])
+// CHECK:      [[VALRET:%.*]] = {{.*}} call { <2 x i32>, <2 x i32> } @llvm.dx.splitdouble.v2i32(<2 x double> [[VALD]])
+// CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 0
+// CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 1
+//
+// SPIRV: define spir_func {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]])
+// SPIRV-NOT:  @llvm.dx.splitdouble.i32
+// SPIRV:      [[LOAD:%.*]] = load <2 x double>, ptr [[VALD]].addr, align 16
+// SPIRV-NEXT: [[CAST1:%.*]] = bitcast <2 x double> [[LOAD]] to <4 x i32>
+// SPIRV-NEXT: [[SHUF1:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+// SPIRV-NEXT: [[SHUF2:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+uint2 test_vector2(double2 D) {
+  uint2 A, B;
+  asuint(D, A, B);
+  return A + B;
+}
+
+// CHECK: define {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]])
+// CHECK:      [[VALRET:%.*]] = {{.*}} call { <3 x i32>, <3 x i32> } @llvm.dx.splitdouble.v3i32(<3 x double> [[VALD]])
+// CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 0
+// CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 1
+//
+// SPIRV: define spir_func {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]])
+// SPIRV-NOT:  @llvm.dx.splitdouble.i32
+// SPIRV:      [[LOAD:%.*]] = load <3 x double>, ptr [[VALD]].addr, align 32
+// SPIRV-NEXT: [[CAST1:%.*]] = bitcast <3 x double> [[LOAD]] to <6 x i32>
+// SPIRV-NEXT: [[SHUF1:%.*]] = shufflevector <6 x i32> [[CAST1]], <6 x i32> poison, <3 x i32> <i32 0, i32 2, i32 4>
+// SPIRV-NEXT: [[SHUF2:%.*]] = shufflevector <6 x i32> [[CAST1]], <6 x i32> poison, <3 x i32> <i32 1, i32 3, i32 5>
+uint3 test_vector3(double3 D) {
+  uint3 A, B;
+  asuint(D, A, B);
+  return A + B;
+}
+
+// CHECK: define {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]])
+// CHECK:      [[VALRET:%.*]] = {{.*}} call { <4 x i32>, <4 x i32> } @llvm.dx.splitdouble.v4i32(<4 x double> [[VALD]])
+// CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 0
+// CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 1
+//
+// SPIRV: define spir_func {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]])
+// SPIRV-NOT: @llvm.dx.splitdouble.i32
+// SPIRV:      [[LOAD:%.*]] = load <4 x double>, ptr [[VALD]].addr, align 32
+// SPIRV-NEXT: [[CAST1:%.*]] = bitcast <4 x double> [[LOAD]] to <8 x i32>
+// SPIRV-NEXT: [[SHUF1:%.*]] = shufflevector <8 x i32> [[CAST1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// SPIRV-NEXT: [[SHUF2:%.*]] = shufflevector <8 x i32> [[CAST1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+uint4 test_vector4(double4 D) {
+  uint4 A, B;
+  asuint(D, A, B);
+  return A + B;
+}
diff --git a/clang/test/Sema/asm.c b/clang/test/Sema/asm.c
index 6cd95c71604d4..28ef3ec6ce09c 100644
--- a/clang/test/Sema/asm.c
+++ b/clang/test/Sema/asm.c
@@ -204,6 +204,12 @@ double f_output_constraint(void) {
   return result;
 }
 
+double f_output_constraint_2(void) {
+  double result;
+  __asm("foo1": "+f" (result)); // expected-error {{invalid output constraint '+f' in asm}}
+  return result;
+}
+
 void fn1(void) {
   int l;
   __asm__(""
diff --git a/clang/test/SemaCXX/constexpr-string.cpp b/clang/test/SemaCXX/constexpr-string.cpp
index c456740ef7551..5448365489a51 100644
--- a/clang/test/SemaCXX/constexpr-string.cpp
+++ b/clang/test/SemaCXX/constexpr-string.cpp
@@ -670,6 +670,8 @@ namespace MemcpyEtc {
   constexpr bool test_address_of_incomplete_struct_type() { // expected-error {{never produces a constant}}
     struct Incomplete;
     extern Incomplete x, y;
+    // expected-warning@+2 {{first argument in call to '__builtin_memcpy' is a pointer to non-trivially copyable type 'Incomplete'}}
+    // expected-note@+1 {{explicitly cast the pointer to silence this warning}}
     __builtin_memcpy(&x, &x, 4);
     // expected-note@-1 2{{cannot constant evaluate 'memcpy' between objects of incomplete type 'Incomplete'}}
     return true;
diff --git a/clang/test/SemaCXX/warn-memaccess.cpp b/clang/test/SemaCXX/warn-memaccess.cpp
new file mode 100644
index 0000000000000..b4b7f6a6905b2
--- /dev/null
+++ b/clang/test/SemaCXX/warn-memaccess.cpp
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wnontrivial-memaccess %s
+
+extern "C" void *bzero(void *, unsigned);
+extern "C" void *memset(void *, int, unsigned);
+extern "C" void *memmove(void *s1, const void *s2, unsigned n);
+extern "C" void *memcpy(void *s1, const void *s2, unsigned n);
+
+class TriviallyCopyable {};
+class NonTriviallyCopyable { NonTriviallyCopyable(const NonTriviallyCopyable&);};
+
+void test_bzero(TriviallyCopyable* tc,
+                 NonTriviallyCopyable *ntc) {
+  // OK
+  bzero(tc, sizeof(*tc));
+
+  // expected-warning@+2{{first argument in call to 'bzero' is a pointer to non-trivially copyable type 'NonTriviallyCopyable'}}
+  // expected-note@+1{{explicitly cast the pointer to silence this warning}}
+  bzero(ntc, sizeof(*ntc));
+
+  // OK
+  bzero((void*)ntc, sizeof(*ntc));
+}
+
+void test_memset(TriviallyCopyable* tc,
+                 NonTriviallyCopyable *ntc) {
+  // OK
+  memset(tc, 0, sizeof(*tc));
+
+  // expected-warning@+2{{first argument in call to 'memset' is a pointer to non-trivially copyable type 'NonTriviallyCopyable'}}
+  // expected-note@+1{{explicitly cast the pointer to silence this warning}}
+  memset(ntc, 0, sizeof(*ntc));
+
+  // OK
+  memset((void*)ntc, 0, sizeof(*ntc));
+}
+
+
+void test_memcpy(TriviallyCopyable* tc0, TriviallyCopyable* tc1,
+                 NonTriviallyCopyable *ntc0, NonTriviallyCopyable *ntc1) {
+  // OK
+  memcpy(tc0, tc1, sizeof(*tc0));
+
+  // expected-warning@+2{{first argument in call to 'memcpy' is a pointer to non-trivially copyable type 'NonTriviallyCopyable'}}
+  // expected-note@+1{{explicitly cast the pointer to silence this warning}}
+  memcpy(ntc0, ntc1, sizeof(*ntc0));
+
+  // ~ OK
+  memcpy((void*)ntc0, ntc1, sizeof(*ntc0));
+
+  // OK
+  memcpy((void*)ntc0, (void*)ntc1, sizeof(*ntc0));
+}
+
+void test_memmove(TriviallyCopyable* tc0, TriviallyCopyable* tc1,
+                 NonTriviallyCopyable *ntc0, NonTriviallyCopyable *ntc1) {
+  // OK
+  memmove(tc0, tc1, sizeof(*tc0));
+
+  // expected-warning@+2{{first argument in call to 'memmove' is a pointer to non-trivially copyable type 'NonTriviallyCopyable'}}
+  // expected-note@+1{{explicitly cast the pointer to silence this warning}}
+  memmove(ntc0, ntc1, sizeof(*ntc0));
+
+  // ~ OK
+  memmove((void*)ntc0, ntc1, sizeof(*ntc0));
+
+  // OK
+  memmove((void*)ntc0, (void*)ntc1, sizeof(*ntc0));
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
index 8c56fdddb1c24..4adb0555c35be 100644
--- a/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/asuint-errors.hlsl
@@ -6,6 +6,10 @@ uint4 test_asuint_too_many_arg(float p0, float p1) {
   // expected-error@-1 {{no matching function for call to 'asuint'}}
   // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'V', but 2 arguments were provided}}
   // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires single argument 'F', but 2 arguments were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 3 arguments, but 2 were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 3 arguments, but 2 were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 3 arguments, but 2 were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 3 arguments, but 2 were provided}}
 }
 
 uint test_asuint_double(double p1) {
@@ -23,3 +27,29 @@ uint test_asuint_half(half p1) {
     // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: could not match 'vector<half, N>' against 'half'}}
     // expected-note@hlsl/hlsl_detail.h:* {{candidate template ignored: substitution failure [with U = uint, T = half]: no type named 'Type'}}
 }
+
+void test_asuint_first_arg_const(double D) {
+  const uint A = 0;
+  uint B;
+  asuint(D, A, B);
+ // expected-error@hlsl/hlsl_intrinsics.h:* {{read-only variable is not assignable}} 
+}
+
+void test_asuint_second_arg_const(double D) {
+  const uint A = 0;
+  uint B;
+  asuint(D, B, A);
+ // expected-error@hlsl/hlsl_intrinsics.h:* {{read-only variable is not assignable}} 
+}
+
+void test_asuint_imidiate_value(double D) {
+  uint B;
+  asuint(D, B, 1);
+ // expected-error@-1 {{cannot bind non-lvalue argument 1 to out paramemter}} 
+}
+
+void test_asuint_expr(double D) {
+  uint B;
+  asuint(D, B, B + 1);
+ // expected-error@-1 {{cannot bind non-lvalue argument B + 1 to out paramemter}} 
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
new file mode 100644
index 0000000000000..18d2b692b335b
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/splitdouble-errors.hlsl
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -verify
+
+void test_no_second_arg(double D) {
+  __builtin_hlsl_elementwise_splitdouble(D);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 1}} 
+}
+
+void test_no_third_arg(double D) {
+  uint A;
+  __builtin_hlsl_elementwise_splitdouble(D, A);
+ // expected-error@-1 {{too few arguments to function call, expected 3, have 2}} 
+}
+
+void test_too_many_arg(double D) {
+  uint A, B, C;
+  __builtin_hlsl_elementwise_splitdouble(D, A, B, C);
+ // expected-error@-1 {{too many arguments to function call, expected 3, have 4}} 
+}
+
+void test_first_arg_type_mismatch(bool3 D) {
+  uint3 A, B;
+  __builtin_hlsl_elementwise_splitdouble(D, A, B);
+ // expected-error@-1 {{invalid operand of type 'bool3' (aka 'vector<bool, 3>') where 'double' or a vector of such type is required}} 
+}
+
+void test_second_arg_type_mismatch(double D) {
+  bool A;
+  uint B;
+  __builtin_hlsl_elementwise_splitdouble(D, A, B);
+ // expected-error@-1 {{invalid operand of type 'bool' where 'unsigned int' or a vector of such type is required}} 
+}
+
+void test_third_arg_type_mismatch(double D) {
+  bool A;
+  uint B;
+  __builtin_hlsl_elementwise_splitdouble(D, B, A);
+ // expected-error@-1 {{invalid operand of type 'bool' where 'unsigned int' or a vector of such type is required}} 
+}
+
+void test_const_second_arg(double D) {
+  const uint A = 1;
+  uint B;
+  __builtin_hlsl_elementwise_splitdouble(D, A, B);
+ // expected-error@-1 {{cannot bind non-lvalue argument A to out paramemter}} 
+}
+
+void test_const_third_arg(double D) {
+  uint A;
+  const uint B = 1;
+  __builtin_hlsl_elementwise_splitdouble(D, A, B);
+ // expected-error@-1 {{cannot bind non-lvalue argument B to out paramemter}} 
+}
+
+void test_number_second_arg(double D) {
+  uint B;
+  __builtin_hlsl_elementwise_splitdouble(D, (uint)1, B);
+ // expected-error@-1 {{cannot bind non-lvalue argument (uint)1 to out paramemter}} 
+}
+
+void test_number_third_arg(double D) {
+  uint B;
+  __builtin_hlsl_elementwise_splitdouble(D, B, (uint)1);
+ // expected-error@-1 {{cannot bind non-lvalue argument (uint)1 to out paramemter}} 
+}
+
+void test_expr_second_arg(double D) {
+  uint B;
+  __builtin_hlsl_elementwise_splitdouble(D, B+1, B);
+ // expected-error@-1 {{cannot bind non-lvalue argument B + 1 to out paramemter}} 
+}
+
+void test_expr_third_arg(double D) {
+  uint B;
+  __builtin_hlsl_elementwise_splitdouble(D, B, B+1);
+ // expected-error@-1 {{cannot bind non-lvalue argument B + 1 to out paramemter}} 
+}
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index 7d36cee7a22b3..f474b1346b1be 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -1080,10 +1080,15 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
                  << NumExistsCalls << " exists() calls\n"
                  << NumIsLocalCalls << " isLocal() calls\n";
 
-  if (PrintTiming)
-    llvm::errs() << llvm::format(
-        "clang-scan-deps timing: %0.2fs wall, %0.2fs process\n",
-        T.getTotalTime().getWallTime(), T.getTotalTime().getProcessTime());
+  if (PrintTiming) {
+    llvm::errs() << "wall time [s]\t"
+                 << "process time [s]\t"
+                 << "instruction count\n";
+    const llvm::TimeRecord &R = T.getTotalTime();
+    llvm::errs() << llvm::format("%0.4f", R.getWallTime()) << "\t"
+                 << llvm::format("%0.4f", R.getProcessTime()) << "\t"
+                 << llvm::format("%llu", R.getInstructionsExecuted()) << "\n";
+  }
 
   if (RoundTripArgs)
     if (FD && FD->roundTripCommands(llvm::errs()))
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index 5b64eaca0e10d..de16f6be8eedb 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -1282,28 +1282,35 @@ static raw_ostream &operator<<(raw_ostream &OS,
 class UncheckedOptionalAccessTest
     : public ::testing::TestWithParam<OptionalTypeIdentifier> {
 protected:
-  void ExpectDiagnosticsFor(std::string SourceCode) {
-    ExpectDiagnosticsFor(SourceCode, ast_matchers::hasName("target"));
+  void ExpectDiagnosticsFor(std::string SourceCode,
+                            bool IgnoreSmartPointerDereference = true) {
+    ExpectDiagnosticsFor(SourceCode, ast_matchers::hasName("target"),
+                         IgnoreSmartPointerDereference);
   }
 
-  void ExpectDiagnosticsForLambda(std::string SourceCode) {
+  void ExpectDiagnosticsForLambda(std::string SourceCode,
+                                  bool IgnoreSmartPointerDereference = true) {
     ExpectDiagnosticsFor(
-        SourceCode, ast_matchers::hasDeclContext(
-                        ast_matchers::cxxRecordDecl(ast_matchers::isLambda())));
+        SourceCode,
+        ast_matchers::hasDeclContext(
+            ast_matchers::cxxRecordDecl(ast_matchers::isLambda())),
+        IgnoreSmartPointerDereference);
   }
 
   template <typename FuncDeclMatcher>
-  void ExpectDiagnosticsFor(std::string SourceCode,
-                            FuncDeclMatcher FuncMatcher) {
+  void ExpectDiagnosticsFor(std::string SourceCode, FuncDeclMatcher FuncMatcher,
+                            bool IgnoreSmartPointerDereference = true) {
     // Run in C++17 and C++20 mode to cover differences in the AST between modes
     // (e.g. C++20 can contain `CXXRewrittenBinaryOperator`).
     for (const char *CxxMode : {"-std=c++17", "-std=c++20"})
-      ExpectDiagnosticsFor(SourceCode, FuncMatcher, CxxMode);
+      ExpectDiagnosticsFor(SourceCode, FuncMatcher, CxxMode,
+                           IgnoreSmartPointerDereference);
   }
 
   template <typename FuncDeclMatcher>
   void ExpectDiagnosticsFor(std::string SourceCode, FuncDeclMatcher FuncMatcher,
-                            const char *CxxMode) {
+                            const char *CxxMode,
+                            bool IgnoreSmartPointerDereference) {
     ReplaceAllOccurrences(SourceCode, "$ns", GetParam().NamespaceName);
     ReplaceAllOccurrences(SourceCode, "$optional", GetParam().TypeName);
 
@@ -1328,8 +1335,7 @@ class UncheckedOptionalAccessTest
       template <typename T>
       T Make();
     )");
-    UncheckedOptionalAccessModelOptions Options{
-        /*IgnoreSmartPointerDereference=*/true};
+    UncheckedOptionalAccessModelOptions Options{IgnoreSmartPointerDereference};
     std::vector<SourceLocation> Diagnostics;
     llvm::Error Error = checkDataflow<UncheckedOptionalAccessModel>(
         AnalysisInputs<UncheckedOptionalAccessModel>(
@@ -3721,6 +3727,50 @@ TEST_P(UncheckedOptionalAccessTest, ConstByValueAccessorWithModInBetween) {
   )cc");
 }
 
+TEST_P(UncheckedOptionalAccessTest, ConstPointerAccessor) {
+  ExpectDiagnosticsFor(R"cc(
+     #include "unchecked_optional_access_test.h"
+
+    struct A {
+      $ns::$optional<int> x;
+    };
+
+    struct MyUniquePtr {
+      A* operator->() const;
+    };
+
+    void target(MyUniquePtr p) {
+      if (p->x) {
+        *p->x;
+      }
+    }
+  )cc",
+                       /*IgnoreSmartPointerDereference=*/false);
+}
+
+TEST_P(UncheckedOptionalAccessTest, ConstPointerAccessorWithModInBetween) {
+  ExpectDiagnosticsFor(R"cc(
+    #include "unchecked_optional_access_test.h"
+
+    struct A {
+      $ns::$optional<int> x;
+    };
+
+    struct MyUniquePtr {
+      A* operator->() const;
+      void reset(A*);
+    };
+
+    void target(MyUniquePtr p) {
+      if (p->x) {
+        p.reset(nullptr);
+        *p->x;  // [[unsafe]]
+      }
+    }
+  )cc",
+                       /*IgnoreSmartPointerDereference=*/false);
+}
+
 TEST_P(UncheckedOptionalAccessTest, ConstBoolAccessor) {
   ExpectDiagnosticsFor(R"cc(
     #include "unchecked_optional_access_test.h"
diff --git a/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
index 5903ed837917c..e57b95b6304a9 100644
--- a/compiler-rt/include/fuzzer/FuzzedDataProvider.h
+++ b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
@@ -18,6 +18,7 @@
 #include <climits>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <cstring>
 #include <initializer_list>
 #include <limits>
diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp
index 9fce67e61ed30..62f8d39a8abaa 100644
--- a/flang/examples/FeatureList/FeatureList.cpp
+++ b/flang/examples/FeatureList/FeatureList.cpp
@@ -473,8 +473,8 @@ struct NodeVisitor {
   READ_FEATURE(OmpDependClause::InOut)
   READ_FEATURE(OmpDependClause::Sink)
   READ_FEATURE(OmpDependClause::Source)
-  READ_FEATURE(OmpDependenceType)
-  READ_FEATURE(OmpDependenceType::Type)
+  READ_FEATURE(OmpTaskDependenceType)
+  READ_FEATURE(OmpTaskDependenceType::Type)
   READ_FEATURE(OmpDependSinkVec)
   READ_FEATURE(OmpDependSinkVecLength)
   READ_FEATURE(OmpEndAllocators)
diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
index 5d3c5cd72eef0..d28ed0534d600 100644
--- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
+++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
@@ -222,9 +222,9 @@ void OpenMPCounterVisitor::Post(const OmpLinearModifier::Type &c) {
   clauseDetails +=
       "modifier=" + std::string{OmpLinearModifier::EnumToString(c)} + ";";
 }
-void OpenMPCounterVisitor::Post(const OmpDependenceType::Type &c) {
+void OpenMPCounterVisitor::Post(const OmpTaskDependenceType::Type &c) {
   clauseDetails +=
-      "type=" + std::string{OmpDependenceType::EnumToString(c)} + ";";
+      "type=" + std::string{OmpTaskDependenceType::EnumToString(c)} + ";";
 }
 void OpenMPCounterVisitor::Post(const OmpMapClause::Type &c) {
   clauseDetails += "type=" + std::string{OmpMapClause::EnumToString(c)} + ";";
diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h
index 380534ebbfd70..68c52db46e2f0 100644
--- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h
+++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.h
@@ -73,7 +73,7 @@ struct OpenMPCounterVisitor {
   void Post(const OmpDeviceTypeClause::Type &c);
   void Post(const OmpScheduleModifierType::ModType &c);
   void Post(const OmpLinearModifier::Type &c);
-  void Post(const OmpDependenceType::Type &c);
+  void Post(const OmpTaskDependenceType::Type &c);
   void Post(const OmpMapClause::Type &c);
   void Post(const OmpScheduleClause::ScheduleType &c);
   void Post(const OmpIfClause::DirectiveNameModifier &c);
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index ccdfe980f6f38..31ad1b7c6ce5b 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -513,8 +513,8 @@ class ParseTreeDumper {
   NODE(OmpDependClause, InOut)
   NODE(OmpDependClause, Sink)
   NODE(OmpDependClause, Source)
-  NODE(parser, OmpDependenceType)
-  NODE_ENUM(OmpDependenceType, Type)
+  NODE(parser, OmpTaskDependenceType)
+  NODE_ENUM(OmpTaskDependenceType, Type)
   NODE(parser, OmpDependSinkVec)
   NODE(parser, OmpDependSinkVecLength)
   NODE(parser, OmpEndAllocators)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 2a312e29a3a44..506a470c5557b 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3439,6 +3439,18 @@ struct OmpObject {
 
 WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
 
+// Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321]
+//
+// task-dependence-type -> // "dependence-type" in 5.1 and before
+//    IN | OUT | INOUT |        // since 4.5
+//    SOURCE | SINK |           // since 4.5, until 5.1
+//    MUTEXINOUTSET | DEPOBJ |  // since 5.0
+//    INOUTSET                  // since 5.2
+struct OmpTaskDependenceType {
+  ENUM_CLASS(Type, In, Out, Inout, Source, Sink)
+  WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Type);
+};
+
 // [5.0] 2.1.6 iterator-specifier -> type-declaration-stmt = subscript-triple
 //             iterator-modifier -> iterator-specifier-list
 struct OmpIteratorSpecifier {
@@ -3534,27 +3546,27 @@ struct OmpDependSinkVecLength {
   std::tuple<DefinedOperator, ScalarIntConstantExpr> t;
 };
 
-// 2.13.9 depend-vec -> iterator [+/- depend-vec-length],...,iterator[...]
+// 2.13.9 depend-vec -> induction-variable [depend-vec-length], ...
 struct OmpDependSinkVec {
   TUPLE_CLASS_BOILERPLATE(OmpDependSinkVec);
   std::tuple<Name, std::optional<OmpDependSinkVecLength>> t;
 };
 
-// 2.13.9 depend-type -> IN | OUT | INOUT | SOURCE | SINK
-struct OmpDependenceType {
-  ENUM_CLASS(Type, In, Out, Inout, Source, Sink)
-  WRAPPER_CLASS_BOILERPLATE(OmpDependenceType, Type);
-};
-
-// 2.13.9 depend-clause -> DEPEND (((IN | OUT | INOUT) : variable-name-list) |
-//                                 SOURCE | SINK : depend-vec)
+// Ref: [4.5:169-170], [5.0:255-256], [5.1:288-289], [5.2:323-324]
+//
+// depend-clause ->
+//    DEPEND(SOURCE) |                               // since 4.5, until 5.1
+//    DEPEND(SINK: depend-vec) |                     // since 4.5, until 5.1
+//    DEPEND([depend-modifier,]dependence-type: locator-list)   // since 4.5
+//
+// depend-modifier -> iterator-modifier              // since 5.0
 struct OmpDependClause {
   UNION_CLASS_BOILERPLATE(OmpDependClause);
   EMPTY_CLASS(Source);
   WRAPPER_CLASS(Sink, std::list<OmpDependSinkVec>);
   struct InOut {
     TUPLE_CLASS_BOILERPLATE(InOut);
-    std::tuple<OmpDependenceType, std::list<Designator>> t;
+    std::tuple<OmpTaskDependenceType, OmpObjectList> t;
   };
   std::variant<Source, Sink, InOut> u;
 };
diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h
index 009715613e29f..5237069a4c739 100644
--- a/flang/include/flang/Runtime/CUDA/registration.h
+++ b/flang/include/flang/Runtime/CUDA/registration.h
@@ -11,6 +11,7 @@
 
 #include "flang/Runtime/entry-names.h"
 #include <cstddef>
+#include <cstdint>
 
 namespace Fortran::runtime::cuda {
 
@@ -23,6 +24,10 @@ void *RTDECL(CUFRegisterModule)(void *data);
 void RTDECL(CUFRegisterFunction)(
     void **module, const char *fctSym, char *fctName);
 
+/// Register a device variable.
+void RTDECL(CUFRegisterVariable)(
+    void **module, char *varSym, const char *varName, int64_t size);
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index fbc031f3a93d7..8fb0dd4a1ec3a 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -798,11 +798,11 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const {
   return findRepeatableClause<omp::clause::Depend>(
       [&](const omp::clause::Depend &clause, const parser::CharBlock &) {
         using Depend = omp::clause::Depend;
-        assert(std::holds_alternative<Depend::WithLocators>(clause.u) &&
-               "Only the modern form is handled at the moment");
-        auto &modern = std::get<Depend::WithLocators>(clause.u);
-        auto kind = std::get<Depend::TaskDependenceType>(modern.t);
-        auto &objects = std::get<omp::ObjectList>(modern.t);
+        assert(std::holds_alternative<Depend::DepType>(clause.u) &&
+               "Only the form with dependence type is handled at the moment");
+        auto &depType = std::get<Depend::DepType>(clause.u);
+        auto kind = std::get<Depend::TaskDependenceType>(depType.t);
+        auto &objects = std::get<omp::ObjectList>(depType.t);
 
         mlir::omp::ClauseTaskDependAttr dependTypeOperand =
             genDependKindAttr(firOpBuilder, kind);
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 3bd89b5432886..b1fa52751fbd7 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -555,7 +555,7 @@ Depend make(const parser::OmpClause::Depend &inp,
   using Iteration = Doacross::Vector::value_type; // LoopIterationT
 
   CLAUSET_ENUM_CONVERT( //
-      convert1, parser::OmpDependenceType::Type, Depend::TaskDependenceType,
+      convert1, parser::OmpTaskDependenceType::Type, Depend::TaskDependenceType,
       // clang-format off
       MS(In,     In)
       MS(Out,    Out)
@@ -593,17 +593,13 @@ Depend make(const parser::OmpClause::Depend &inp,
             return Doacross{{/*DependenceType=*/Doacross::DependenceType::Sink,
                              /*Vector=*/makeList(s.v, convert2)}};
           },
-          // Depend::WithLocators
+          // Depend::DepType
           [&](const wrapped::InOut &s) -> Variant {
-            auto &t0 = std::get<parser::OmpDependenceType>(s.t);
-            auto &t1 = std::get<std::list<parser::Designator>>(s.t);
-            auto convert4 = [&](const parser::Designator &t) {
-              return makeObject(t, semaCtx);
-            };
-            return Depend::WithLocators{
-                {/*TaskDependenceType=*/convert1(t0.v),
-                 /*Iterator=*/std::nullopt,
-                 /*LocatorList=*/makeList(t1, convert4)}};
+            auto &t0 = std::get<parser::OmpTaskDependenceType>(s.t);
+            auto &t1 = std::get<parser::OmpObjectList>(s.t);
+            return Depend::DepType{{/*TaskDependenceType=*/convert1(t0.v),
+                                    /*Iterator=*/std::nullopt,
+                                    /*LocatorList=*/makeObjects(t1, semaCtx)}};
           },
       },
       inp.v.u)};
diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
index a4761f24f16d7..dc39be8574f84 100644
--- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
@@ -11,6 +11,7 @@
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/allocatable.h"
 #include "mlir/IR/SymbolTable.h"
@@ -58,6 +59,32 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
       prepareImplicitDeviceGlobals(funcOp, symTable);
       return mlir::WalkResult::advance();
     });
+
+    // Copying the device global variable into the gpu module
+    mlir::SymbolTable parentSymTable(mod);
+    auto gpuMod =
+        parentSymTable.lookup<mlir::gpu::GPUModuleOp>(cudaDeviceModuleName);
+    if (gpuMod) {
+      mlir::SymbolTable gpuSymTable(gpuMod);
+      for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
+        auto attr = globalOp.getDataAttrAttr();
+        if (!attr)
+          continue;
+        switch (attr.getValue()) {
+        case cuf::DataAttribute::Device:
+        case cuf::DataAttribute::Constant:
+        case cuf::DataAttribute::Managed: {
+          auto globalName{globalOp.getSymbol().getValue()};
+          if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
+            break;
+          }
+          gpuSymTable.insert(globalOp->clone());
+        } break;
+        default:
+          break;
+        }
+      }
+    }
   }
 };
 } // namespace
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index ae0c351fed56d..3ca4e93a6c9b9 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -365,10 +365,10 @@ TYPE_PARSER(construct<OmpDependSinkVecLength>(
 TYPE_PARSER(
     construct<OmpDependSinkVec>(name, maybe(Parser<OmpDependSinkVecLength>{})))
 
-TYPE_PARSER(
-    construct<OmpDependenceType>("IN"_id >> pure(OmpDependenceType::Type::In) ||
-        "INOUT" >> pure(OmpDependenceType::Type::Inout) ||
-        "OUT" >> pure(OmpDependenceType::Type::Out)))
+TYPE_PARSER(construct<OmpTaskDependenceType>(
+    "IN"_id >> pure(OmpTaskDependenceType::Type::In) ||
+    "INOUT" >> pure(OmpTaskDependenceType::Type::Inout) ||
+    "OUT" >> pure(OmpTaskDependenceType::Type::Out)))
 
 TYPE_CONTEXT_PARSER("Omp Depend clause"_en_US,
     construct<OmpDependClause>(construct<OmpDependClause::Sink>(
@@ -376,7 +376,7 @@ TYPE_CONTEXT_PARSER("Omp Depend clause"_en_US,
         construct<OmpDependClause>(
             construct<OmpDependClause::Source>("SOURCE"_tok)) ||
         construct<OmpDependClause>(construct<OmpDependClause::InOut>(
-            Parser<OmpDependenceType>{}, ":" >> nonemptyList(designator))))
+            Parser<OmpTaskDependenceType>{}, ":" >> Parser<OmpObjectList>{})))
 
 // 2.15.3.7 LINEAR (linear-list: linear-step)
 //          linear-list -> list | modifier(list)
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index ba4155469073e..39fcb61609e33 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2216,9 +2216,9 @@ class UnparseVisitor {
   }
   void Unparse(const OmpDependClause::InOut &x) {
     Put("(");
-    Walk(std::get<OmpDependenceType>(x.t));
+    Walk(std::get<OmpTaskDependenceType>(x.t));
     Put(":");
-    Walk(std::get<std::list<Designator>>(x.t), ",");
+    Walk(std::get<OmpObjectList>(x.t));
     Put(")");
   }
   bool Pre(const OmpDependClause &x) {
@@ -2829,7 +2829,7 @@ class UnparseVisitor {
       OmpLastprivateClause, LastprivateModifier) // OMP lastprivate-modifier
   WALK_NESTED_ENUM(OmpScheduleModifierType, ModType) // OMP schedule-modifier
   WALK_NESTED_ENUM(OmpLinearModifier, Type) // OMP linear-modifier
-  WALK_NESTED_ENUM(OmpDependenceType, Type) // OMP dependence-type
+  WALK_NESTED_ENUM(OmpTaskDependenceType, Type) // OMP task-dependence-type
   WALK_NESTED_ENUM(OmpScheduleClause, ScheduleType) // OMP schedule-type
   WALK_NESTED_ENUM(OmpDeviceClause, DeviceModifier) // OMP device modifier
   WALK_NESTED_ENUM(OmpDeviceTypeClause, Type) // OMP DEVICE_TYPE
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 455322d610d6c..599cc61a83bf0 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -3288,15 +3288,21 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) {
         parser::ToUpperCaseLetters(getDirectiveName(GetContext().directive)));
   }
   if (const auto *inOut{std::get_if<parser::OmpDependClause::InOut>(&x.v.u)}) {
-    const auto &designators{std::get<std::list<parser::Designator>>(inOut->t)};
-    for (const auto &ele : designators) {
-      if (const auto *dataRef{std::get_if<parser::DataRef>(&ele.u)}) {
-        CheckDependList(*dataRef);
-        if (const auto *arr{
-                std::get_if<common::Indirection<parser::ArrayElement>>(
-                    &dataRef->u)}) {
-          CheckArraySection(arr->value(), GetLastName(*dataRef),
-              llvm::omp::Clause::OMPC_depend);
+    for (const auto &object : std::get<parser::OmpObjectList>(inOut->t).v) {
+      if (const auto *name{std::get_if<parser::Name>(&object.u)}) {
+        context_.Say(GetContext().clauseSource,
+            "Common block name ('%s') cannot appear in a DEPEND "
+            "clause"_err_en_US,
+            name->ToString());
+      } else if (auto *designator{std::get_if<parser::Designator>(&object.u)}) {
+        if (auto *dataRef{std::get_if<parser::DataRef>(&designator->u)}) {
+          CheckDependList(*dataRef);
+          if (const auto *arr{
+                  std::get_if<common::Indirection<parser::ArrayElement>>(
+                      &dataRef->u)}) {
+            CheckArraySection(arr->value(), GetLastName(*dataRef),
+                llvm::omp::Clause::OMPC_depend);
+          }
         }
       }
     }
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 979570a7d4103..014b7987a658b 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -435,6 +435,20 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPAllocatorsConstruct &);
   void Post(const parser::OpenMPAllocatorsConstruct &);
 
+  void Post(const parser::OmpObjectList &x) {
+    // The objects from OMP clauses should have already been resolved,
+    // except common blocks (the ResolveNamesVisitor does not visit
+    // parser::Name, those are dealt with as members of other structures).
+    // Iterate over elements of x, and resolve any common blocks that
+    // are still unresolved.
+    for (const parser::OmpObject &obj : x.v) {
+      auto *name{std::get_if<parser::Name>(&obj.u)};
+      if (name && !name->symbol) {
+        Resolve(*name, currScope().MakeCommonBlock(name->source));
+      }
+    }
+  }
+
   // 2.15.3 Data-Sharing Attribute Clauses
   void Post(const parser::OmpDefaultClause &);
   bool Pre(const parser::OmpClause::Shared &x) {
@@ -531,16 +545,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
     return false;
   }
 
-  bool Pre(const parser::OmpDependClause &x) {
-    if (const auto *dependSink{
-            std::get_if<parser::OmpDependClause::Sink>(&x.u)}) {
-      const auto &dependSinkVec{dependSink->v};
-      for (const auto &dependSinkElement : dependSinkVec) {
-        const auto &name{std::get<parser::Name>(dependSinkElement.t)};
-        ResolveName(&name);
-      }
-    }
-    return false;
+  void Post(const parser::OmpDependSinkVec &x) {
+    const auto &name{std::get<parser::Name>(x.t)};
+    ResolveName(&name);
   }
 
   bool Pre(const parser::OmpClause::UseDevicePtr &x) {
diff --git a/flang/runtime/CUDA/registration.cpp b/flang/runtime/CUDA/registration.cpp
index 20d274c4d8d1c..b7b6ef389bffb 100644
--- a/flang/runtime/CUDA/registration.cpp
+++ b/flang/runtime/CUDA/registration.cpp
@@ -21,6 +21,9 @@ extern void __cudaRegisterFatBinaryEnd(void *);
 extern void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
     char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid,
     uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize);
+extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
+    const char *deviceAddress, const char *deviceName, int ext, size_t size,
+    int constant, int global);
 
 void *RTDECL(CUFRegisterModule)(void *data) {
   void **fatHandle{__cudaRegisterFatBinary(data)};
@@ -34,6 +37,11 @@ void RTDEF(CUFRegisterFunction)(
       (uint3 *)0, (dim3 *)0, (dim3 *)0, (int *)0);
 }
 
+void RTDEF(CUFRegisterVariable)(
+    void **module, char *varSym, const char *varName, int64_t size) {
+  __cudaRegisterVar(module, varSym, varName, varName, 0, size, 0, 0);
+}
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90
new file mode 100644
index 0000000000000..c83a938d5af21
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-device-global.f90
@@ -0,0 +1,13 @@
+
+// RUN: fir-opt --split-input-file --cuf-device-global %s | FileCheck %s
+
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} {
+  fir.global @_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<5xi32>
+
+  gpu.module @cuda_device_mod [#nvvm.target] {
+  }
+}
+
+// CHECK: gpu.module @cuda_device_mod [#nvvm.target] 
+// CHECK-NEXT: fir.global @_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda<device>} : !fir.array<5xi32>
diff --git a/flang/test/Semantics/OpenMP/depend04.f90 b/flang/test/Semantics/OpenMP/depend04.f90
new file mode 100644
index 0000000000000..8bdddb017d2c9
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/depend04.f90
@@ -0,0 +1,10 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=50
+
+subroutine f00
+  integer :: x
+  common /cc/ x
+!ERROR: Common block name ('cc') cannot appear in a DEPEND clause
+  !$omp task depend(in: /cc/)
+  x = 0
+  !$omp end task
+end
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 13dc892978bb8..80545ee4b359f 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -51,10 +51,13 @@ add_proxy_header_library(
     libc.include.llvm-libc-macros.generic_error_number_macros
 )
 
+add_header_library(fcntl_overlay HDRS fcntl_overlay.h)
 add_proxy_header_library(
   fcntl_macros
   HDRS
     fcntl_macros.h
+  DEPENDS
+    .fcntl_overlay
   FULL_BUILD_DEPENDS
     libc.include.llvm-libc-macros.fcntl_macros
     libc.include.fcntl
diff --git a/libc/hdr/fcntl_macros.h b/libc/hdr/fcntl_macros.h
index 828cb984c0cb1..3a1ddeb0a2da1 100644
--- a/libc/hdr/fcntl_macros.h
+++ b/libc/hdr/fcntl_macros.h
@@ -15,7 +15,7 @@
 
 #else // Overlay mode
 
-#include <fcntl.h>
+#include "hdr/fcntl_overlay.h"
 
 #endif // LLVM_LIBC_FULL_BUILD
 
diff --git a/libc/hdr/fcntl_overlay.h b/libc/hdr/fcntl_overlay.h
new file mode 100644
index 0000000000000..c1cc98b0ebb2c
--- /dev/null
+++ b/libc/hdr/fcntl_overlay.h
@@ -0,0 +1,37 @@
+//===-- Including fcntl.h in overlay mode ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_FCNTL_OVERLAY_H
+#define LLVM_LIBC_HDR_FCNTL_OVERLAY_H
+
+#ifdef LIBC_FULL_BUILD
+#error "This header should only be included in overlay mode"
+#endif
+
+// Overlay mode
+
+// glibc <fcntl.h> header might provide extern inline definitions for few
+// functions, causing external alias errors.  They are guarded by
+// `__USE_FORTIFY_LEVEL`, which will be temporarily disabled
+// with `_FORTIFY_SOURCE`.
+
+#ifdef __USE_FORTIFY_LEVEL
+#define LIBC_OLD_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL
+#undef __USE_FORTIFY_LEVEL
+#define __USE_FORTIFY_LEVEL 0
+#endif
+
+#include <fcntl.h>
+
+#ifdef LIBC_OLD_USE_FORTIFY_LEVEL
+#undef __USE_FORTIFY_LEVEL
+#define __USE_FORTIFY_LEVEL LIBC_OLD_USE_FORTIFY_LEVEL
+#undef LIBC_OLD_USE_FORTIFY_LEVEL
+#endif
+
+#endif // LLVM_LIBC_HDR_FCNTL_OVERLAY_H
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index fab5245816bbe..e45979857d795 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -46,6 +46,17 @@ add_proxy_header_library(
     libc.include.llvm-libc-types.struct_timespec
 )
 
+add_proxy_header_library(
+  mode_t
+  HDRS
+    mode_t.h
+  DEPENDS
+    ../fcntl_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.mode_t
+    libc.include.fcntl
+)
+
 add_proxy_header_library(
   fenv_t
   HDRS
diff --git a/libc/hdr/types/mode_t.h b/libc/hdr/types/mode_t.h
new file mode 100644
index 0000000000000..abbbdb0a09d7b
--- /dev/null
+++ b/libc/hdr/types/mode_t.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from mode_t.h --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_MODE_T_H
+#define LLVM_LIBC_HDR_MODE_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/mode_t.h"
+
+#else // Overlay mode
+
+#include "hdr/fcntl_overlay.h"
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_MODE_T_H
diff --git a/libc/src/__support/File/linux/CMakeLists.txt b/libc/src/__support/File/linux/CMakeLists.txt
index 5abbf11b3671c..84e3d5608361e 100644
--- a/libc/src/__support/File/linux/CMakeLists.txt
+++ b/libc/src/__support/File/linux/CMakeLists.txt
@@ -7,7 +7,7 @@ add_object_library(
     file.h
     lseekImpl.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_syscall
     libc.include.sys_stat
     libc.src.__support.CPP.new
@@ -55,7 +55,7 @@ add_object_library(
   SRCS
     dir.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
     libc.src.__support.error_or
diff --git a/libc/src/__support/File/linux/dir.cpp b/libc/src/__support/File/linux/dir.cpp
index fc90ff097e460..5fe44fa8297b6 100644
--- a/libc/src/__support/File/linux/dir.cpp
+++ b/libc/src/__support/File/linux/dir.cpp
@@ -12,7 +12,7 @@
 #include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 
-#include <fcntl.h>       // For open flags
+#include "hdr/fcntl_macros.h" // For open flags
 #include <sys/syscall.h> // For syscall numbers
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp
index 22292336f300e..824c1f200e8c5 100644
--- a/libc/src/__support/File/linux/file.cpp
+++ b/libc/src/__support/File/linux/file.cpp
@@ -18,7 +18,7 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h" // For error macros
 
-#include <fcntl.h>       // For mode_t and other flags to the open syscall
+#include "hdr/fcntl_macros.h" // For mode_t and other flags to the open syscall
 #include <sys/stat.h>    // For S_IS*, S_IF*, and S_IR* flags.
 #include <sys/syscall.h> // For syscall numbers
 
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index b6796f40adce7..fa11458f99b6c 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -79,7 +79,7 @@ add_object_library(
     .futex_utils
     libc.config.app_h
     libc.include.sys_syscall
-    libc.include.fcntl
+    libc.hdr.fcntl_macros  
     libc.src.errno.errno
     libc.src.__support.CPP.atomic
     libc.src.__support.CPP.stringstream
diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp
index ee3f63fa3cde3..c531d74c53355 100644
--- a/libc/src/__support/threads/linux/thread.cpp
+++ b/libc/src/__support/threads/linux/thread.cpp
@@ -22,7 +22,7 @@
 #include <arm_acle.h>
 #endif
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <linux/prctl.h> // For PR_SET_NAME
 #include <linux/sched.h> // For CLONE_* flags.
diff --git a/libc/src/fcntl/creat.h b/libc/src/fcntl/creat.h
index e180e17c25788..3e00427638a36 100644
--- a/libc/src/fcntl/creat.h
+++ b/libc/src/fcntl/creat.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_FCNTL_CREAT_H
 #define LLVM_LIBC_SRC_FCNTL_CREAT_H
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
-#include <fcntl.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/fcntl/linux/CMakeLists.txt b/libc/src/fcntl/linux/CMakeLists.txt
index ee8ae63b8cf06..580db16cd4132 100644
--- a/libc/src/fcntl/linux/CMakeLists.txt
+++ b/libc/src/fcntl/linux/CMakeLists.txt
@@ -5,7 +5,7 @@ add_entrypoint_object(
   HDRS
     ../creat.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
@@ -17,7 +17,7 @@ add_entrypoint_object(
   HDRS
     ../fcntl.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.src.__support.OSUtil.osutil
 )
 
@@ -28,7 +28,8 @@ add_entrypoint_object(
   HDRS
     ../open.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.types.mode_t
+    libc.hdr.fcntl_macros
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
@@ -40,7 +41,7 @@ add_entrypoint_object(
   HDRS
     ../openat.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.types.mode_t
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
diff --git a/libc/src/fcntl/linux/creat.cpp b/libc/src/fcntl/linux/creat.cpp
index 2c5b5d736a3be..23abae243aed9 100644
--- a/libc/src/fcntl/linux/creat.cpp
+++ b/libc/src/fcntl/linux/creat.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/fcntl/linux/open.cpp b/libc/src/fcntl/linux/open.cpp
index 79b7b2b32c887..8b699ecdd2043 100644
--- a/libc/src/fcntl/linux/open.cpp
+++ b/libc/src/fcntl/linux/open.cpp
@@ -13,7 +13,8 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/fcntl/linux/openat.cpp b/libc/src/fcntl/linux/openat.cpp
index 0862082c22ebf..6063d9c00ad6c 100644
--- a/libc/src/fcntl/linux/openat.cpp
+++ b/libc/src/fcntl/linux/openat.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <fcntl.h>
+#include "hdr/types/mode_t.h"
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/fcntl/open.h b/libc/src/fcntl/open.h
index 19bb53c2e3203..11f0ae5379531 100644
--- a/libc/src/fcntl/open.h
+++ b/libc/src/fcntl/open.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_FCNTL_OPEN_H
 #define LLVM_LIBC_SRC_FCNTL_OPEN_H
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
-#include <fcntl.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/fcntl/openat.h b/libc/src/fcntl/openat.h
index d09791a84f735..051c8a2304dcb 100644
--- a/libc/src/fcntl/openat.h
+++ b/libc/src/fcntl/openat.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_FCNTL_OPENAT_H
 #define LLVM_LIBC_SRC_FCNTL_OPENAT_H
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
-#include <fcntl.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/cbrt.h b/libc/src/math/cbrt.h
index a7d5fe80e57b3..8cf7d9b221df3 100644
--- a/libc/src/math/cbrt.h
+++ b/libc/src/math/cbrt.h
@@ -9,10 +9,12 @@
 #ifndef LLVM_LIBC_SRC_MATH_CBRT_H
 #define LLVM_LIBC_SRC_MATH_CBRT_H
 
-namespace LIBC_NAMESPACE {
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
 
 double cbrt(double x);
 
-} // namespace LIBC_NAMESPACE
+} // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_MATH_CBRT_H
diff --git a/libc/src/spawn/linux/CMakeLists.txt b/libc/src/spawn/linux/CMakeLists.txt
index 9ef3a9d18b0c6..26148fe1c76db 100644
--- a/libc/src/spawn/linux/CMakeLists.txt
+++ b/libc/src/spawn/linux/CMakeLists.txt
@@ -5,7 +5,8 @@ add_entrypoint_object(
   HDRS
     ../posix_spawn.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.types.mode_t
+    libc.hdr.fcntl_macros
     libc.include.spawn
     libc.include.sys_syscall
     libc.include.signal
diff --git a/libc/src/spawn/linux/posix_spawn.cpp b/libc/src/spawn/linux/posix_spawn.cpp
index 4c0469b3ce384..fe82ba260148a 100644
--- a/libc/src/spawn/linux/posix_spawn.cpp
+++ b/libc/src/spawn/linux/posix_spawn.cpp
@@ -14,7 +14,8 @@
 #include "src/__support/macros/config.h"
 #include "src/spawn/file_actions.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
 #include <signal.h> // For SIGCHLD
 #include <spawn.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/stdio/gpu/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp
index 6222589cc4bab..46196d7d2b10f 100644
--- a/libc/src/stdio/gpu/fprintf.cpp
+++ b/libc/src/stdio/gpu/fprintf.cpp
@@ -16,7 +16,7 @@
 
 #include <stdarg.h>
 
-namespace LIBC_NAMESPACE {
+namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, fprintf,
                    (::FILE *__restrict stream, const char *__restrict format,
@@ -29,4 +29,4 @@ LLVM_LIBC_FUNCTION(int, fprintf,
   return ret_val;
 }
 
-} // namespace LIBC_NAMESPACE
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp
index d9903193ef165..be1885fd6801d 100644
--- a/libc/src/stdio/gpu/printf.cpp
+++ b/libc/src/stdio/gpu/printf.cpp
@@ -15,7 +15,7 @@
 
 #include <stdarg.h>
 
-namespace LIBC_NAMESPACE {
+namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) {
   va_list vlist;
@@ -26,4 +26,4 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) {
   return ret_val;
 }
 
-} // namespace LIBC_NAMESPACE
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/gpu/vfprintf.cpp b/libc/src/stdio/gpu/vfprintf.cpp
index 961cfa48579e0..c92685f48c728 100644
--- a/libc/src/stdio/gpu/vfprintf.cpp
+++ b/libc/src/stdio/gpu/vfprintf.cpp
@@ -14,7 +14,7 @@
 #include "src/errno/libc_errno.h"
 #include "src/stdio/gpu/vfprintf_utils.h"
 
-namespace LIBC_NAMESPACE {
+namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, vfprintf,
                    (::FILE *__restrict stream, const char *__restrict format,
@@ -24,4 +24,4 @@ LLVM_LIBC_FUNCTION(int, vfprintf,
   return ret_val;
 }
 
-} // namespace LIBC_NAMESPACE
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h
index 93ce1649869fc..5010ee16d9607 100644
--- a/libc/src/stdio/gpu/vfprintf_utils.h
+++ b/libc/src/stdio/gpu/vfprintf_utils.h
@@ -9,10 +9,11 @@
 #include "hdr/types/FILE.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/macros/config.h"
 #include "src/stdio/gpu/file.h"
 #include "src/string/string_utils.h"
 
-namespace LIBC_NAMESPACE {
+namespace LIBC_NAMESPACE_DECL {
 
 template <uint16_t opcode>
 LIBC_INLINE int vfprintf_impl(::FILE *__restrict file,
@@ -82,4 +83,4 @@ LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream,
 #endif
 }
 
-} // namespace LIBC_NAMESPACE
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/gpu/vprintf.cpp b/libc/src/stdio/gpu/vprintf.cpp
index 2bb74d7f017b5..54012f3071844 100644
--- a/libc/src/stdio/gpu/vprintf.cpp
+++ b/libc/src/stdio/gpu/vprintf.cpp
@@ -13,7 +13,7 @@
 #include "src/errno/libc_errno.h"
 #include "src/stdio/gpu/vfprintf_utils.h"
 
-namespace LIBC_NAMESPACE {
+namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, vprintf,
                    (const char *__restrict format, va_list vlist)) {
@@ -22,4 +22,4 @@ LLVM_LIBC_FUNCTION(int, vprintf,
   return ret_val;
 }
 
-} // namespace LIBC_NAMESPACE
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/linux/CMakeLists.txt b/libc/src/stdio/linux/CMakeLists.txt
index d6241e1ca0439..1b2fcb33ce54d 100644
--- a/libc/src/stdio/linux/CMakeLists.txt
+++ b/libc/src/stdio/linux/CMakeLists.txt
@@ -5,7 +5,7 @@ add_entrypoint_object(
   HDRS
     ../remove.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -22,6 +22,7 @@ add_entrypoint_object(
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
+    libc.hdr.fcntl_macros
 )
 
 add_entrypoint_object(
diff --git a/libc/src/stdio/linux/remove.cpp b/libc/src/stdio/linux/remove.cpp
index 9e299aaf43e45..dbb4491d0e6cc 100644
--- a/libc/src/stdio/linux/remove.cpp
+++ b/libc/src/stdio/linux/remove.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h" // For AT_* macros.
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>       // For AT_* macros.
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/linux/rename.cpp b/libc/src/stdio/linux/rename.cpp
index 69fd22720ed19..fbcb29be48f4e 100644
--- a/libc/src/stdio/linux/rename.cpp
+++ b/libc/src/stdio/linux/rename.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/stdio/rename.h"
-#include "include/llvm-libc-macros/linux/fcntl-macros.h"
+#include "hdr/fcntl_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/src/stdio/vsscanf.h b/libc/src/stdio/vsscanf.h
index 992c44d3d95b9..c57b1743e477e 100644
--- a/libc/src/stdio/vsscanf.h
+++ b/libc/src/stdio/vsscanf.h
@@ -9,12 +9,14 @@
 #ifndef LLVM_LIBC_SRC_STDIO_VSSCANF_H
 #define LLVM_LIBC_SRC_STDIO_VSSCANF_H
 
+#include "src/__support/macros/config.h"
+
 #include <stdarg.h>
 
-namespace LIBC_NAMESPACE {
+namespace LIBC_NAMESPACE_DECL {
 
 int vsscanf(const char *s, const char *format, va_list vlist);
 
-} // namespace LIBC_NAMESPACE
+} // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_STDIO_VSSCANF_H
diff --git a/libc/src/sys/mman/linux/CMakeLists.txt b/libc/src/sys/mman/linux/CMakeLists.txt
index 11188254cfbd4..47c16f79bc8d5 100644
--- a/libc/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/src/sys/mman/linux/CMakeLists.txt
@@ -187,8 +187,7 @@ add_entrypoint_object(
     ../shm_open.h
   DEPENDS
     libc.src.fcntl.open
-    libc.include.llvm-libc-macros.fcntl_macros
-    libc.include.llvm-libc-types.mode_t
+    libc.hdr.types.mode_t
     .shm_common
 )
 
diff --git a/libc/src/sys/mman/linux/shm_open.cpp b/libc/src/sys/mman/linux/shm_open.cpp
index d235e57aefdeb..11de482272d00 100644
--- a/libc/src/sys/mman/linux/shm_open.cpp
+++ b/libc/src/sys/mman/linux/shm_open.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/sys/mman/shm_open.h"
-#include "llvm-libc-macros/fcntl-macros.h"
+#include "hdr/types/mode_t.h"
 #include "src/__support/macros/config.h"
 #include "src/fcntl/open.h"
 #include "src/sys/mman/linux/shm_common.h"
diff --git a/libc/src/sys/mman/shm_open.h b/libc/src/sys/mman/shm_open.h
index c890304aa4acf..1872dd30cb6f5 100644
--- a/libc/src/sys/mman/shm_open.h
+++ b/libc/src/sys/mman/shm_open.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
 #define LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
 
+#include "hdr/types/mode_t.h"
 #include "src/__support/macros/config.h"
-#include <llvm-libc-types/mode_t.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/stat/linux/CMakeLists.txt b/libc/src/sys/stat/linux/CMakeLists.txt
index 415d2fa5c8771..9aeb14636c2c1 100644
--- a/libc/src/sys/stat/linux/CMakeLists.txt
+++ b/libc/src/sys/stat/linux/CMakeLists.txt
@@ -5,7 +5,8 @@ add_entrypoint_object(
   HDRS
     ../chmod.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.types.mode_t
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -19,6 +20,7 @@ add_entrypoint_object(
   HDRS
     ../fchmod.h
   DEPENDS
+    libc.hdr.types.mode_t
     libc.include.sys_stat
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -45,7 +47,8 @@ add_entrypoint_object(
   HDRS
     ../mkdir.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.types.mode_t
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -84,7 +87,7 @@ add_entrypoint_object(
     ../stat.h
   DEPENDS
     .kernel_statx
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
 )
@@ -97,7 +100,7 @@ add_entrypoint_object(
     ../lstat.h
   DEPENDS
     .kernel_statx
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
 )
@@ -110,7 +113,7 @@ add_entrypoint_object(
     ../fstat.h
   DEPENDS
     .kernel_statx
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
 )
diff --git a/libc/src/sys/stat/linux/chmod.cpp b/libc/src/sys/stat/linux/chmod.cpp
index c91cabb514a8c..57d5bae6b8191 100644
--- a/libc/src/sys/stat/linux/chmod.cpp
+++ b/libc/src/sys/stat/linux/chmod.cpp
@@ -11,9 +11,10 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fchmod.cpp b/libc/src/sys/stat/linux/fchmod.cpp
index 7b6c7b7091a82..0d6fd359169aa 100644
--- a/libc/src/sys/stat/linux/fchmod.cpp
+++ b/libc/src/sys/stat/linux/fchmod.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/types/mode_t.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fstat.cpp b/libc/src/sys/stat/linux/fstat.cpp
index 411aa47bcda2a..35cf8f08f782d 100644
--- a/libc/src/sys/stat/linux/fstat.cpp
+++ b/libc/src/sys/stat/linux/fstat.cpp
@@ -13,7 +13,7 @@
 
 #include "src/__support/common.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/stat/linux/lstat.cpp b/libc/src/sys/stat/linux/lstat.cpp
index 5a6eff068d1dd..354c5b6e029a4 100644
--- a/libc/src/sys/stat/linux/lstat.cpp
+++ b/libc/src/sys/stat/linux/lstat.cpp
@@ -14,7 +14,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/stat/linux/mkdir.cpp b/libc/src/sys/stat/linux/mkdir.cpp
index 527c3d2058d2b..b319b5c8393de 100644
--- a/libc/src/sys/stat/linux/mkdir.cpp
+++ b/libc/src/sys/stat/linux/mkdir.cpp
@@ -11,9 +11,10 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/stat.cpp b/libc/src/sys/stat/linux/stat.cpp
index c5149e6e3c883..de9cdb197d687 100644
--- a/libc/src/sys/stat/linux/stat.cpp
+++ b/libc/src/sys/stat/linux/stat.cpp
@@ -13,7 +13,7 @@
 
 #include "src/__support/common.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt
index 9b0d752cefbd8..472438ca72e49 100644
--- a/libc/src/unistd/linux/CMakeLists.txt
+++ b/libc/src/unistd/linux/CMakeLists.txt
@@ -5,6 +5,7 @@ add_entrypoint_object(
   HDRS
     ../access.h
   DEPENDS
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -57,7 +58,7 @@ add_entrypoint_object(
   HDRS
     ../dup2.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -254,7 +255,7 @@ add_entrypoint_object(
   HDRS
     ../link.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -268,7 +269,7 @@ add_entrypoint_object(
   HDRS
     ../linkat.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -377,7 +378,7 @@ add_entrypoint_object(
   HDRS
     ../rmdir.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -391,7 +392,7 @@ add_entrypoint_object(
   HDRS
     ../readlink.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -405,7 +406,7 @@ add_entrypoint_object(
   HDRS
     ../readlinkat.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -419,7 +420,7 @@ add_entrypoint_object(
   HDRS
     ../symlink.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -433,7 +434,7 @@ add_entrypoint_object(
   HDRS
     ../symlinkat.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -485,7 +486,7 @@ add_entrypoint_object(
   HDRS
     ../unlink.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
@@ -499,7 +500,7 @@ add_entrypoint_object(
   HDRS
     ../unlinkat.h
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
diff --git a/libc/src/unistd/linux/access.cpp b/libc/src/unistd/linux/access.cpp
index e9ad74989b056..2f7ebbcdf9e81 100644
--- a/libc/src/unistd/linux/access.cpp
+++ b/libc/src/unistd/linux/access.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp
index 51a19a71a7d85..c7c7c1a8ca786 100644
--- a/libc/src/unistd/linux/dup2.cpp
+++ b/libc/src/unistd/linux/dup2.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/link.cpp b/libc/src/unistd/linux/link.cpp
index 37ca58eab1096..477806a70df74 100644
--- a/libc/src/unistd/linux/link.cpp
+++ b/libc/src/unistd/linux/link.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/linkat.cpp b/libc/src/unistd/linux/linkat.cpp
index fcd6a5f75a196..40f68cc90c480 100644
--- a/libc/src/unistd/linux/linkat.cpp
+++ b/libc/src/unistd/linux/linkat.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/readlink.cpp b/libc/src/unistd/linux/readlink.cpp
index 7b15245004405..2055e6b3400f2 100644
--- a/libc/src/unistd/linux/readlink.cpp
+++ b/libc/src/unistd/linux/readlink.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/readlinkat.cpp b/libc/src/unistd/linux/readlinkat.cpp
index 19a9ff9fbeb72..e5e4d0d39bc9c 100644
--- a/libc/src/unistd/linux/readlinkat.cpp
+++ b/libc/src/unistd/linux/readlinkat.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/rmdir.cpp b/libc/src/unistd/linux/rmdir.cpp
index 8974468ebcf16..075af12af64c5 100644
--- a/libc/src/unistd/linux/rmdir.cpp
+++ b/libc/src/unistd/linux/rmdir.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/symlink.cpp b/libc/src/unistd/linux/symlink.cpp
index 5efd4df85edab..9e1b2886ea0f5 100644
--- a/libc/src/unistd/linux/symlink.cpp
+++ b/libc/src/unistd/linux/symlink.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/symlinkat.cpp b/libc/src/unistd/linux/symlinkat.cpp
index 63d2e6d1507a5..bcf2d0f8cc055 100644
--- a/libc/src/unistd/linux/symlinkat.cpp
+++ b/libc/src/unistd/linux/symlinkat.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/unlink.cpp b/libc/src/unistd/linux/unlink.cpp
index de7cae8b826eb..72d8e2398e3d7 100644
--- a/libc/src/unistd/linux/unlink.cpp
+++ b/libc/src/unistd/linux/unlink.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include <fcntl.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/unlinkat.cpp b/libc/src/unistd/linux/unlinkat.cpp
index e794f242b9459..4ed20f542f170 100644
--- a/libc/src/unistd/linux/unlinkat.cpp
+++ b/libc/src/unistd/linux/unlinkat.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/test/src/fcntl/CMakeLists.txt b/libc/test/src/fcntl/CMakeLists.txt
index 48048b7fe8866..b522fef7439df 100644
--- a/libc/test/src/fcntl/CMakeLists.txt
+++ b/libc/test/src/fcntl/CMakeLists.txt
@@ -42,7 +42,7 @@ add_libc_unittest(
   SRCS
     openat_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.src.errno.errno
     libc.src.fcntl.open
     libc.src.fcntl.openat
diff --git a/libc/test/src/fcntl/openat_test.cpp b/libc/test/src/fcntl/openat_test.cpp
index 9dafd125224a4..547359eb9f7a9 100644
--- a/libc/test/src/fcntl/openat_test.cpp
+++ b/libc/test/src/fcntl/openat_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 
 TEST(LlvmLibcUniStd, OpenAndReadTest) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
diff --git a/libc/test/src/sys/sendfile/CMakeLists.txt b/libc/test/src/sys/sendfile/CMakeLists.txt
index 82efaa147bd89..ceaa4accdd06e 100644
--- a/libc/test/src/sys/sendfile/CMakeLists.txt
+++ b/libc/test/src/sys/sendfile/CMakeLists.txt
@@ -9,7 +9,7 @@ add_libc_unittest(
   SRCS
     sendfile_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno 
     libc.src.fcntl.open
diff --git a/libc/test/src/sys/sendfile/sendfile_test.cpp b/libc/test/src/sys/sendfile/sendfile_test.cpp
index 59025438a2467..a658212ddb72c 100644
--- a/libc/test/src/sys/sendfile/sendfile_test.cpp
+++ b/libc/test/src/sys/sendfile/sendfile_test.cpp
@@ -17,7 +17,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 namespace cpp = LIBC_NAMESPACE::cpp;
diff --git a/libc/test/src/sys/stat/CMakeLists.txt b/libc/test/src/sys/stat/CMakeLists.txt
index 877a129b627dd..dd3d0932755b7 100644
--- a/libc/test/src/sys/stat/CMakeLists.txt
+++ b/libc/test/src/sys/stat/CMakeLists.txt
@@ -9,7 +9,7 @@ add_libc_unittest(
   SRCS
     chmod_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
     libc.src.fcntl.open
@@ -25,7 +25,7 @@ add_libc_unittest(
   SRCS
     fchmodat_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
     libc.src.fcntl.open
@@ -41,7 +41,7 @@ add_libc_unittest(
   SRCS
     fchmod_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
     libc.src.fcntl.open
@@ -57,7 +57,7 @@ add_libc_unittest(
   SRCS
     mkdirat_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
     libc.src.sys.stat.mkdirat
@@ -71,7 +71,7 @@ add_libc_unittest(
   SRCS
     stat_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
     libc.src.sys.stat.stat
@@ -87,7 +87,7 @@ add_libc_unittest(
   SRCS
     lstat_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
     libc.src.sys.stat.lstat
@@ -103,7 +103,7 @@ add_libc_unittest(
   SRCS
     fstat_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_stat
     libc.src.errno.errno
     libc.src.sys.stat.fstat
diff --git a/libc/test/src/sys/stat/chmod_test.cpp b/libc/test/src/sys/stat/chmod_test.cpp
index c688996615cee..83ab0f45b6f08 100644
--- a/libc/test/src/sys/stat/chmod_test.cpp
+++ b/libc/test/src/sys/stat/chmod_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 TEST(LlvmLibcChmodTest, ChangeAndOpen) {
diff --git a/libc/test/src/sys/stat/fchmod_test.cpp b/libc/test/src/sys/stat/fchmod_test.cpp
index 91c0f68b8708c..03eb79d95ddd6 100644
--- a/libc/test/src/sys/stat/fchmod_test.cpp
+++ b/libc/test/src/sys/stat/fchmod_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 TEST(LlvmLibcChmodTest, ChangeAndOpen) {
diff --git a/libc/test/src/sys/stat/fchmodat_test.cpp b/libc/test/src/sys/stat/fchmodat_test.cpp
index c43ef8ae13315..09970b6e0fb16 100644
--- a/libc/test/src/sys/stat/fchmodat_test.cpp
+++ b/libc/test/src/sys/stat/fchmodat_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 TEST(LlvmLibcFchmodatTest, ChangeAndOpen) {
diff --git a/libc/test/src/sys/stat/fstat_test.cpp b/libc/test/src/sys/stat/fstat_test.cpp
index 1379eae26a47a..34c675d1a4e29 100644
--- a/libc/test/src/sys/stat/fstat_test.cpp
+++ b/libc/test/src/sys/stat/fstat_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 TEST(LlvmLibcFStatTest, CreatAndReadMode) {
diff --git a/libc/test/src/sys/stat/lstat_test.cpp b/libc/test/src/sys/stat/lstat_test.cpp
index b44b3d1a59ce7..a723d5ae2e297 100644
--- a/libc/test/src/sys/stat/lstat_test.cpp
+++ b/libc/test/src/sys/stat/lstat_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 TEST(LlvmLibcLStatTest, CreatAndReadMode) {
diff --git a/libc/test/src/sys/stat/mkdirat_test.cpp b/libc/test/src/sys/stat/mkdirat_test.cpp
index cbacc16b402d7..85e013de234e7 100644
--- a/libc/test/src/sys/stat/mkdirat_test.cpp
+++ b/libc/test/src/sys/stat/mkdirat_test.cpp
@@ -11,7 +11,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 
 TEST(LlvmLibcMkdiratTest, CreateAndRemove) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
diff --git a/libc/test/src/sys/stat/stat_test.cpp b/libc/test/src/sys/stat/stat_test.cpp
index baf363382022a..0ddd8baaec1c9 100644
--- a/libc/test/src/sys/stat/stat_test.cpp
+++ b/libc/test/src/sys/stat/stat_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>
 
 TEST(LlvmLibcStatTest, CreatAndReadMode) {
diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
index 8cb5f867453e4..2f3e0b96ff095 100644
--- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
@@ -1,4 +1,4 @@
-#include "llvm-libc-macros/linux/fcntl-macros.h"
+#include "hdr/fcntl_macros.h"
 #include "src/__support/macros/config.h"
 #include "src/fcntl/open.h"
 #include "src/sys/statvfs/fstatvfs.h"
diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt
index e03e56b3cf8ad..ce936cebad426 100644
--- a/libc/test/src/unistd/CMakeLists.txt
+++ b/libc/test/src/unistd/CMakeLists.txt
@@ -24,11 +24,12 @@ add_libc_unittest(
   SRCS
     chdir_test.cpp
   DEPENDS
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.src.errno.errno
-    libc.src.fcntl.open
     libc.src.unistd.chdir
     libc.src.unistd.close
+    libc.src.fcntl.open 
     libc.test.UnitTest.ErrnoSetterMatcher
 )
 
@@ -223,7 +224,7 @@ add_libc_unittest(
   SRCS
     rmdir_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.src.errno.errno
     libc.src.sys.stat.mkdir
     libc.src.unistd.rmdir
@@ -262,7 +263,7 @@ add_libc_unittest(
   SRCS
     readlinkat_test.cpp
   DEPENDS
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.unistd
     libc.src.errno.errno
     libc.src.unistd.readlinkat
@@ -410,7 +411,7 @@ add_libc_unittest(
     syscall_test.cpp
   DEPENDS
     libc.include.unistd
-    libc.include.fcntl
+    libc.hdr.fcntl_macros
     libc.include.sys_syscall
     libc.src.errno.errno 
     libc.src.unistd.__llvm_libc_syscall
diff --git a/libc/test/src/unistd/chdir_test.cpp b/libc/test/src/unistd/chdir_test.cpp
index 51dc7bb15d3ee..e1bdcd77119f7 100644
--- a/libc/test/src/unistd/chdir_test.cpp
+++ b/libc/test/src/unistd/chdir_test.cpp
@@ -13,7 +13,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 
 TEST(LlvmLibcChdirTest, ChangeAndOpen) {
   // The idea of this test is that we will first open an existing test file
diff --git a/libc/test/src/unistd/fchdir_test.cpp b/libc/test/src/unistd/fchdir_test.cpp
index ae88e1f22ed6b..0e39fde17c67b 100644
--- a/libc/test/src/unistd/fchdir_test.cpp
+++ b/libc/test/src/unistd/fchdir_test.cpp
@@ -13,7 +13,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 
 TEST(LlvmLibcChdirTest, ChangeAndOpen) {
   // The idea of this test is that we will first open an existing test file
diff --git a/libc/test/src/unistd/readlinkat_test.cpp b/libc/test/src/unistd/readlinkat_test.cpp
index 1fa683b02b5b5..9e4bb9af02e76 100644
--- a/libc/test/src/unistd/readlinkat_test.cpp
+++ b/libc/test/src/unistd/readlinkat_test.cpp
@@ -15,7 +15,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 
 namespace cpp = LIBC_NAMESPACE::cpp;
 
diff --git a/libc/test/src/unistd/rmdir_test.cpp b/libc/test/src/unistd/rmdir_test.cpp
index 93cb0f3f53c1b..4f4cd94c5cf0b 100644
--- a/libc/test/src/unistd/rmdir_test.cpp
+++ b/libc/test/src/unistd/rmdir_test.cpp
@@ -12,7 +12,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 
 TEST(LlvmLibcRmdirTest, CreateAndRemove) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
diff --git a/libc/test/src/unistd/syscall_test.cpp b/libc/test/src/unistd/syscall_test.cpp
index cee29bd9afa30..f6cc3eab9aabe 100644
--- a/libc/test/src/unistd/syscall_test.cpp
+++ b/libc/test/src/unistd/syscall_test.cpp
@@ -11,7 +11,7 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <fcntl.h>
+#include "hdr/fcntl_macros.h"
 #include <sys/stat.h>    // For S_* flags.
 #include <sys/syscall.h> // For syscall numbers.
 #include <unistd.h>
diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h
index 54af1fa1a1cc5..3fa948ecc43cf 100644
--- a/libcxx/include/__memory/uninitialized_algorithms.h
+++ b/libcxx/include/__memory/uninitialized_algorithms.h
@@ -638,7 +638,8 @@ __uninitialized_allocator_relocate(_Alloc& __alloc, _Tp* __first, _Tp* __last, _
     __guard.__complete();
     std::__allocator_destroy(__alloc, __first, __last);
   } else {
-    __builtin_memcpy(__result, __first, sizeof(_Tp) * (__last - __first));
+    // Casting to void* to suppress clang complaining that this is technically UB.
+    __builtin_memcpy(static_cast<void*>(__result), __first, sizeof(_Tp) * (__last - __first));
   }
 }
 
diff --git a/libcxx/test/std/utilities/expected/types.h b/libcxx/test/std/utilities/expected/types.h
index 2b6983fb399c6..df73ebdfe495e 100644
--- a/libcxx/test/std/utilities/expected/types.h
+++ b/libcxx/test/std/utilities/expected/types.h
@@ -162,7 +162,7 @@ template <int Constant>
 struct TailClobberer {
   constexpr TailClobberer() noexcept {
     if (!std::is_constant_evaluated()) {
-      std::memset(this, Constant, sizeof(*this));
+      std::memset(static_cast<void*>(this), Constant, sizeof(*this));
     }
     // Always set `b` itself to `false` so that the comparison works.
     b = false;
@@ -245,7 +245,7 @@ struct BoolWithPadding {
   constexpr explicit BoolWithPadding() noexcept : BoolWithPadding(false) {}
   constexpr BoolWithPadding(bool val) noexcept {
     if (!std::is_constant_evaluated()) {
-      std::memset(this, 0, sizeof(*this));
+      std::memset(static_cast<void*>(this), 0, sizeof(*this));
     }
     val_ = val;
   }
@@ -268,7 +268,7 @@ struct IntWithoutPadding {
   constexpr explicit IntWithoutPadding() noexcept : IntWithoutPadding(0) {}
   constexpr IntWithoutPadding(int val) noexcept {
     if (!std::is_constant_evaluated()) {
-      std::memset(this, 0, sizeof(*this));
+      std::memset(static_cast<void*>(this), 0, sizeof(*this));
     }
     val_ = val;
   }
diff --git a/libcxx/test/support/min_allocator.h b/libcxx/test/support/min_allocator.h
index 13ee98289c36b..18f51f8072640 100644
--- a/libcxx/test/support/min_allocator.h
+++ b/libcxx/test/support/min_allocator.h
@@ -465,14 +465,14 @@ class safe_allocator {
   TEST_CONSTEXPR_CXX20 T* allocate(std::size_t n) {
     T* memory = std::allocator<T>().allocate(n);
     if (!TEST_IS_CONSTANT_EVALUATED)
-      std::memset(memory, 0, sizeof(T) * n);
+      std::memset(static_cast<void*>(memory), 0, sizeof(T) * n);
 
     return memory;
   }
 
   TEST_CONSTEXPR_CXX20 void deallocate(T* p, std::size_t n) {
     if (!TEST_IS_CONSTANT_EVALUATED)
-      DoNotOptimize(std::memset(p, 0, sizeof(T) * n));
+      DoNotOptimize(std::memset(static_cast<void*>(p), 0, sizeof(T) * n));
     std::allocator<T>().deallocate(p, n);
   }
 
diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocation.h b/lldb/include/lldb/Breakpoint/BreakpointLocation.h
index 3592291bb2d06..cca00335bc3c6 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointLocation.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointLocation.h
@@ -11,12 +11,10 @@
 
 #include <memory>
 #include <mutex>
-#include <optional>
 
 #include "lldb/Breakpoint/BreakpointOptions.h"
 #include "lldb/Breakpoint/StoppointHitCounter.h"
 #include "lldb/Core/Address.h"
-#include "lldb/Symbol/LineEntry.h"
 #include "lldb/Utility/UserID.h"
 #include "lldb/lldb-private.h"
 
@@ -284,25 +282,6 @@ class BreakpointLocation
   /// Returns the breakpoint location ID.
   lldb::break_id_t GetID() const { return m_loc_id; }
 
-  /// Set the line entry that should be shown to users for this location.
-  /// It is up to the caller to verify that this is a valid entry to show.
-  /// The current use of this is to distinguish among line entries from a
-  /// virtual inlined call stack that all share the same address.
-  /// The line entry must have the same start address as the address for this
-  /// location.
-  bool SetPreferredLineEntry(const LineEntry &line_entry) {
-    if (m_address == line_entry.range.GetBaseAddress()) {
-      m_preferred_line_entry = line_entry;
-      return true;
-    }
-    assert(0 && "Tried to set a preferred line entry with a different address");
-    return false;
-  }
-
-  const std::optional<LineEntry> GetPreferredLineEntry() {
-    return m_preferred_line_entry;
-  }
-
 protected:
   friend class BreakpointSite;
   friend class BreakpointLocationList;
@@ -327,16 +306,6 @@ class BreakpointLocation
   /// If it returns false we should continue, otherwise stop.
   bool IgnoreCountShouldStop();
 
-  /// If this location knows that the virtual stack frame it represents is
-  /// not frame 0, return the suggested stack frame instead.  This will happen
-  /// when the location's address contains a "virtual inlined call stack" and
-  /// the breakpoint was set on a file & line that are not at the bottom of that
-  /// stack.  For now we key off the "preferred line entry" - looking for that
-  /// in the blocks that start with the stop PC.
-  /// This version of the API doesn't take an "inlined" parameter because it
-  /// only changes frames in the inline stack.
-  std::optional<uint32_t> GetSuggestedStackFrameIndex();
-
 private:
   void SwapLocation(lldb::BreakpointLocationSP swap_from);
 
@@ -400,11 +369,6 @@ class BreakpointLocation
   lldb::break_id_t m_loc_id; ///< Breakpoint location ID.
   StoppointHitCounter m_hit_counter; ///< Number of times this breakpoint
                                      /// location has been hit.
-  /// If this exists, use it to print the stop description rather than the
-  /// LineEntry m_address resolves to directly.  Use this for instance when the
-  /// location was given somewhere in the virtual inlined call stack since the
-  /// Address always resolves to the lowest entry in the stack.
-  std::optional<LineEntry> m_preferred_line_entry;
 
   void SetShouldResolveIndirectFunctions(bool do_resolve) {
     m_should_resolve_indirect_functions = do_resolve;
diff --git a/lldb/include/lldb/Breakpoint/BreakpointSite.h b/lldb/include/lldb/Breakpoint/BreakpointSite.h
index 7b3f7be23639f..17b76d51c1ae5 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointSite.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointSite.h
@@ -170,11 +170,6 @@ class BreakpointSite : public std::enable_shared_from_this<BreakpointSite>,
   /// \see lldb::DescriptionLevel
   void GetDescription(Stream *s, lldb::DescriptionLevel level);
 
-  // This runs through all the breakpoint locations owning this site and returns
-  // the greatest of their suggested stack frame indexes.  This only handles
-  // inlined stack changes.
-  std::optional<uint32_t> GetSuggestedStackFrameIndex();
-
   /// Tell whether a breakpoint has a location at this site.
   ///
   /// \param[in] bp_id
diff --git a/lldb/include/lldb/Core/Declaration.h b/lldb/include/lldb/Core/Declaration.h
index c864b88c6b32a..4a0e9047b5469 100644
--- a/lldb/include/lldb/Core/Declaration.h
+++ b/lldb/include/lldb/Core/Declaration.h
@@ -84,14 +84,10 @@ class Declaration {
   /// \param[in] declaration
   ///     The const Declaration object to compare with.
   ///
-  /// \param[in] full
-  ///     Same meaning as Full in FileSpec::Equal.  True means an empty
-  ///     directory is not equal to a specified one, false means it is equal.
-  ///
   /// \return
   ///     Returns \b true if \b declaration is at the same file and
   ///     line, \b false otherwise.
-  bool FileAndLineEqual(const Declaration &declaration, bool full) const;
+  bool FileAndLineEqual(const Declaration &declaration) const;
 
   /// Dump a description of this object to a Stream.
   ///
diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h
index 45beac129e86f..fae90364deaf0 100644
--- a/lldb/include/lldb/Target/StopInfo.h
+++ b/lldb/include/lldb/Target/StopInfo.h
@@ -77,18 +77,6 @@ class StopInfo : public std::enable_shared_from_this<StopInfo> {
       m_description.clear();
   }
 
-  /// This gives the StopInfo a chance to suggest a stack frame to select.
-  /// Passing true for inlined_stack will request changes to the inlined
-  /// call stack.  Passing false will request changes to the real stack
-  /// frame.  The inlined stack gets adjusted before we call into the thread
-  /// plans so they can reason based on the correct values.  The real stack
-  /// adjustment is handled after the frame recognizers get a chance to adjust
-  /// the frame.
-  virtual std::optional<uint32_t>
-  GetSuggestedStackFrameIndex(bool inlined_stack) {
-    return {};
-  }
-
   virtual bool IsValidForOperatingSystemThread(Thread &thread) { return true; }
 
   /// A Continue operation can result in a false stop event
diff --git a/lldb/include/lldb/Target/ThreadPlanStepInRange.h b/lldb/include/lldb/Target/ThreadPlanStepInRange.h
index 9da8370ef1c92..f9ef87942a7c0 100644
--- a/lldb/include/lldb/Target/ThreadPlanStepInRange.h
+++ b/lldb/include/lldb/Target/ThreadPlanStepInRange.h
@@ -80,8 +80,8 @@ class ThreadPlanStepInRange : public ThreadPlanStepRange,
   bool m_step_past_prologue; // FIXME: For now hard-coded to true, we could put
                              // a switch in for this if there's
                              // demand for that.
-  LazyBool m_virtual_step;   // true if we've just done a "virtual step", i.e.
-                             // just moved the inline stack depth.
+  bool m_virtual_step; // true if we've just done a "virtual step", i.e. just
+                       // moved the inline stack depth.
   ConstString m_step_into_target;
   ThreadPlanStepInRange(const ThreadPlanStepInRange &) = delete;
   const ThreadPlanStepInRange &
diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp
index c7ea50407ae1c..ad9057c8141e9 100644
--- a/lldb/source/Breakpoint/BreakpointLocation.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocation.cpp
@@ -508,20 +508,8 @@ void BreakpointLocation::GetDescription(Stream *s,
         s->PutCString("re-exported target = ");
       else
         s->PutCString("where = ");
-
-      // If there's a preferred line entry for printing, use that.
-      bool show_function_info = true;
-      if (auto preferred = GetPreferredLineEntry()) {
-        sc.line_entry = *preferred;
-        // FIXME: We're going to get the function name wrong when the preferred
-        // line entry is not the lowest one.  For now, just leave the function
-        // out in this case, but we really should also figure out how to easily
-        // fake the function name here.
-        show_function_info = false;
-      }
       sc.DumpStopContext(s, m_owner.GetTarget().GetProcessSP().get(), m_address,
-                         false, true, false, show_function_info,
-                         show_function_info, show_function_info);
+                         false, true, false, true, true, true);
     } else {
       if (sc.module_sp) {
         s->EOL();
@@ -549,10 +537,7 @@ void BreakpointLocation::GetDescription(Stream *s,
         if (sc.line_entry.line > 0) {
           s->EOL();
           s->Indent("location = ");
-          if (auto preferred = GetPreferredLineEntry())
-            preferred->DumpStopContext(s, true);
-          else
-            sc.line_entry.DumpStopContext(s, true);
+          sc.line_entry.DumpStopContext(s, true);
         }
 
       } else {
@@ -671,50 +656,6 @@ void BreakpointLocation::SendBreakpointLocationChangedEvent(
   }
 }
 
-std::optional<uint32_t> BreakpointLocation::GetSuggestedStackFrameIndex() {
-  auto preferred_opt = GetPreferredLineEntry();
-  if (!preferred_opt)
-    return {};
-  LineEntry preferred = *preferred_opt;
-  SymbolContext sc;
-  if (!m_address.CalculateSymbolContext(&sc))
-    return {};
-  // Don't return anything special if frame 0 is the preferred line entry.
-  // We not really telling the stack frame list to do anything special in that
-  // case.
-  if (!LineEntry::Compare(sc.line_entry, preferred))
-    return {};
-
-  if (!sc.block)
-    return {};
-
-  // Blocks have their line info in Declaration form, so make one here:
-  Declaration preferred_decl(preferred.GetFile(), preferred.line,
-                             preferred.column);
-
-  uint32_t depth = 0;
-  Block *inlined_block = sc.block->GetContainingInlinedBlock();
-  while (inlined_block) {
-    // If we've moved to a block that this isn't the start of, that's not
-    // our inlining info or call site, so we can stop here.
-    Address start_address;
-    if (!inlined_block->GetStartAddress(start_address) ||
-        start_address != m_address)
-      return {};
-
-    const InlineFunctionInfo *info = inlined_block->GetInlinedFunctionInfo();
-    if (info) {
-      if (preferred_decl == info->GetDeclaration())
-        return depth;
-      if (preferred_decl == info->GetCallSite())
-        return depth + 1;
-    }
-    inlined_block = inlined_block->GetInlinedParent();
-    depth++;
-  }
-  return {};
-}
-
 void BreakpointLocation::SwapLocation(BreakpointLocationSP swap_from) {
   m_address = swap_from->m_address;
   m_should_resolve_indirect_functions =
diff --git a/lldb/source/Breakpoint/BreakpointResolver.cpp b/lldb/source/Breakpoint/BreakpointResolver.cpp
index 9643602d78c75..8307689c7640c 100644
--- a/lldb/source/Breakpoint/BreakpointResolver.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolver.cpp
@@ -340,21 +340,6 @@ void BreakpointResolver::AddLocation(SearchFilter &filter,
   }
 
   BreakpointLocationSP bp_loc_sp(AddLocation(line_start));
-  // If the address that we resolved the location to returns a different
-  // LineEntry from the one in the incoming SC, we're probably dealing with an
-  // inlined call site, so set that as the preferred LineEntry:
-  LineEntry resolved_entry;
-  if (!skipped_prologue && bp_loc_sp &&
-      line_start.CalculateSymbolContextLineEntry(resolved_entry) &&
-      LineEntry::Compare(resolved_entry, sc.line_entry)) {
-    // FIXME: The function name will also be wrong here.  Do we need to record
-    // that as well, or can we figure that out again when we report this
-    // breakpoint location.
-    if (!bp_loc_sp->SetPreferredLineEntry(sc.line_entry)) {
-      LLDB_LOG(log, "Tried to add a preferred line entry that didn't have the "
-                    "same address as this location's address.");
-    }
-  }
   if (log && bp_loc_sp && !GetBreakpoint()->IsInternal()) {
     StreamString s;
     bp_loc_sp->GetDescription(&s, lldb::eDescriptionLevelVerbose);
diff --git a/lldb/source/Breakpoint/BreakpointSite.cpp b/lldb/source/Breakpoint/BreakpointSite.cpp
index 9700a57d3346e..3ca93f908e30b 100644
--- a/lldb/source/Breakpoint/BreakpointSite.cpp
+++ b/lldb/source/Breakpoint/BreakpointSite.cpp
@@ -87,23 +87,6 @@ void BreakpointSite::GetDescription(Stream *s, lldb::DescriptionLevel level) {
   m_constituents.GetDescription(s, level);
 }
 
-std::optional<uint32_t> BreakpointSite::GetSuggestedStackFrameIndex() {
-
-  std::optional<uint32_t> result;
-  std::lock_guard<std::recursive_mutex> guard(m_constituents_mutex);
-  for (BreakpointLocationSP loc_sp : m_constituents.BreakpointLocations()) {
-    std::optional<uint32_t> loc_frame_index =
-        loc_sp->GetSuggestedStackFrameIndex();
-    if (loc_frame_index) {
-      if (result)
-        result = std::max(*loc_frame_index, *result);
-      else
-        result = loc_frame_index;
-    }
-  }
-  return result;
-}
-
 bool BreakpointSite::IsInternal() const { return m_constituents.IsInternal(); }
 
 uint8_t *BreakpointSite::GetTrapOpcodeBytes() { return &m_trap_opcode[0]; }
diff --git a/lldb/source/Core/Declaration.cpp b/lldb/source/Core/Declaration.cpp
index a485c4b9ba48a..579a3999d14ea 100644
--- a/lldb/source/Core/Declaration.cpp
+++ b/lldb/source/Core/Declaration.cpp
@@ -70,9 +70,8 @@ int Declaration::Compare(const Declaration &a, const Declaration &b) {
   return 0;
 }
 
-bool Declaration::FileAndLineEqual(const Declaration &declaration,
-                                   bool full) const {
-  int file_compare = FileSpec::Compare(this->m_file, declaration.m_file, full);
+bool Declaration::FileAndLineEqual(const Declaration &declaration) const {
+  int file_compare = FileSpec::Compare(this->m_file, declaration.m_file, true);
   return file_compare == 0 && this->m_line == declaration.m_line;
 }
 
diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp
index 5c7772a6db780..f7d9c0d2d3306 100644
--- a/lldb/source/Symbol/Block.cpp
+++ b/lldb/source/Symbol/Block.cpp
@@ -230,7 +230,7 @@ Block *Block::GetContainingInlinedBlockWithCallSite(
     const auto *function_info = inlined_block->GetInlinedFunctionInfo();
 
     if (function_info &&
-        function_info->GetCallSite().FileAndLineEqual(find_call_site, true))
+        function_info->GetCallSite().FileAndLineEqual(find_call_site))
       return inlined_block;
     inlined_block = inlined_block->GetInlinedParent();
   }
diff --git a/lldb/source/Symbol/CompileUnit.cpp b/lldb/source/Symbol/CompileUnit.cpp
index f0f7e40ae70d8..db8f8ce6bcbc9 100644
--- a/lldb/source/Symbol/CompileUnit.cpp
+++ b/lldb/source/Symbol/CompileUnit.cpp
@@ -251,10 +251,7 @@ void CompileUnit::ResolveSymbolContext(
     SymbolContextItem resolve_scope, SymbolContextList &sc_list,
     RealpathPrefixes *realpath_prefixes) {
   const FileSpec file_spec = src_location_spec.GetFileSpec();
-  const uint32_t line =
-      src_location_spec.GetLine().value_or(LLDB_INVALID_LINE_NUMBER);
-  const uint32_t column_num =
-      src_location_spec.GetColumn().value_or(LLDB_INVALID_COLUMN_NUMBER);
+  const uint32_t line = src_location_spec.GetLine().value_or(0);
   const bool check_inlines = src_location_spec.GetCheckInlines();
 
   // First find all of the file indexes that match our "file_spec". If
@@ -315,112 +312,6 @@ void CompileUnit::ResolveSymbolContext(
         0, file_indexes, src_location_spec, &line_entry);
   }
 
-  // If we didn't manage to find a breakpoint that matched the line number
-  // requested, that might be because it is only an inline call site, and
-  // doesn't have a line entry in the line table.  Scan for that here.
-  //
-  // We are making the assumption that if there was an inlined function it will
-  // contribute at least 1 non-call-site entry to the line table.  That's handy
-  // because we don't move line breakpoints over function boundaries, so if we
-  // found a hit, and there were also a call site entry, it would have to be in
-  // the function containing the PC of the line table match.  That way we can
-  // limit the call site search to that function.
-  // We will miss functions that ONLY exist as a call site entry.
-
-  if (line_entry.IsValid() &&
-      (line_entry.line != line || line_entry.column != column_num) &&
-      resolve_scope & eSymbolContextLineEntry && check_inlines) {
-    // We don't move lines over function boundaries, so the address in the
-    // line entry will be the in function that contained the line that might
-    // be a CallSite, and we can just iterate over that function to find any
-    // inline records, and dig up their call sites.
-    Address start_addr = line_entry.range.GetBaseAddress();
-    Function *function = start_addr.CalculateSymbolContextFunction();
-
-    Declaration sought_decl(file_spec, line, column_num);
-    // We use this recursive function to descend the block structure looking
-    // for a block that has this Declaration as in it's CallSite info.
-    // This function recursively scans the sibling blocks of the incoming
-    // block parameter.
-    std::function<void(Block &)> examine_block =
-        [&sought_decl, &sc_list, &src_location_spec, resolve_scope,
-         &examine_block](Block &block) -> void {
-      // Iterate over the sibling child blocks of the incoming block.
-      Block *sibling_block = block.GetFirstChild();
-      while (sibling_block) {
-        // We only have to descend through the regular blocks, looking for
-        // immediate inlines, since those are the only ones that will have this
-        // callsite.
-        const InlineFunctionInfo *inline_info =
-            sibling_block->GetInlinedFunctionInfo();
-        if (inline_info) {
-          // If this is the call-site we are looking for, record that:
-          // We need to be careful because the call site from the debug info
-          // will generally have a column, but the user might not have specified
-          // it.
-          Declaration found_decl = inline_info->GetCallSite();
-          uint32_t sought_column = sought_decl.GetColumn();
-          if (found_decl.FileAndLineEqual(sought_decl, false) &&
-              (sought_column == LLDB_INVALID_COLUMN_NUMBER ||
-               sought_column == found_decl.GetColumn())) {
-            // If we found a call site, it belongs not in this inlined block,
-            // but in the parent block that inlined it.
-            Address parent_start_addr;
-            if (sibling_block->GetParent()->GetStartAddress(
-                    parent_start_addr)) {
-              SymbolContext sc;
-              parent_start_addr.CalculateSymbolContext(&sc, resolve_scope);
-              // Now swap out the line entry for the one we found.
-              LineEntry call_site_line = sc.line_entry;
-              call_site_line.line = found_decl.GetLine();
-              call_site_line.column = found_decl.GetColumn();
-              bool matches_spec = true;
-              // If the user asked for an exact match, we need to make sure the
-              // call site we found actually matches the location.
-              if (src_location_spec.GetExactMatch()) {
-                matches_spec = false;
-                if ((src_location_spec.GetFileSpec() ==
-                     sc.line_entry.GetFile()) &&
-                    (src_location_spec.GetLine() &&
-                     *src_location_spec.GetLine() == call_site_line.line) &&
-                    (src_location_spec.GetColumn() &&
-                     *src_location_spec.GetColumn() == call_site_line.column))
-                  matches_spec = true;
-              }
-              if (matches_spec &&
-                  sibling_block->GetRangeAtIndex(0, call_site_line.range)) {
-                SymbolContext call_site_sc(sc.target_sp, sc.module_sp,
-                                           sc.comp_unit, sc.function, sc.block,
-                                           &call_site_line, sc.symbol);
-                sc_list.Append(call_site_sc);
-              }
-            }
-          }
-        }
-
-        // Descend into the child blocks:
-        examine_block(*sibling_block);
-        // Now go to the next sibling:
-        sibling_block = sibling_block->GetSibling();
-      }
-    };
-
-    if (function) {
-      // We don't need to examine the function block, it can't be inlined.
-      Block &func_block = function->GetBlock(true);
-      examine_block(func_block);
-    }
-    // If we found entries here, we are done.  We only get here because we
-    // didn't find an exact line entry for this line & column, but if we found
-    // an exact match from the call site info that's strictly better than
-    // continuing to look for matches further on in the file.
-    // FIXME: Should I also do this for "call site line exists between the
-    // given line number and the later line we found in the line table"?  That's
-    // a closer approximation to our general sliding algorithm.
-    if (sc_list.GetSize())
-      return;
-  }
-
   // If "exact == true", then "found_line" will be the same as "line". If
   // "exact == false", the "found_line" will be the closest line entry
   // with a line number greater than "line" and we will use this for our
diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp
index 94a381edd5e20..3849ec5ed178d 100644
--- a/lldb/source/Target/StackFrameList.cpp
+++ b/lldb/source/Target/StackFrameList.cpp
@@ -85,32 +85,121 @@ void StackFrameList::ResetCurrentInlinedDepth() {
     return;
 
   std::lock_guard<std::recursive_mutex> guard(m_mutex);
+  
+  GetFramesUpTo(0, DoNotAllowInterruption);
+  if (m_frames.empty())
+    return;
+  if (!m_frames[0]->IsInlined()) {
+    m_current_inlined_depth = UINT32_MAX;
+    m_current_inlined_pc = LLDB_INVALID_ADDRESS;
+    Log *log = GetLog(LLDBLog::Step);
+    if (log && log->GetVerbose())
+      LLDB_LOGF(
+          log,
+          "ResetCurrentInlinedDepth: Invalidating current inlined depth.\n");
+    return;
+  }
 
-  m_current_inlined_pc = LLDB_INVALID_ADDRESS;
-  m_current_inlined_depth = UINT32_MAX;
+  // We only need to do something special about inlined blocks when we are
+  // at the beginning of an inlined function:
+  // FIXME: We probably also have to do something special if the PC is at
+  // the END of an inlined function, which coincides with the end of either
+  // its containing function or another inlined function.
+
+  Block *block_ptr = m_frames[0]->GetFrameBlock();
+  if (!block_ptr)
+    return;
 
+  Address pc_as_address;
+  lldb::addr_t curr_pc = m_thread.GetRegisterContext()->GetPC();
+  pc_as_address.SetLoadAddress(curr_pc, &(m_thread.GetProcess()->GetTarget()));
+  AddressRange containing_range;
+  if (!block_ptr->GetRangeContainingAddress(pc_as_address, containing_range) ||
+      pc_as_address != containing_range.GetBaseAddress())
+    return;
+
+  // If we got here because of a breakpoint hit, then set the inlined depth
+  // depending on where the breakpoint was set. If we got here because of a
+  // crash, then set the inlined depth to the deepest most block.  Otherwise,
+  // we stopped here naturally as the result of a step, so set ourselves in the
+  // containing frame of the whole set of nested inlines, so the user can then
+  // "virtually" step into the frames one by one, or next over the whole mess.
+  // Note: We don't have to handle being somewhere in the middle of the stack
+  // here, since ResetCurrentInlinedDepth doesn't get called if there is a
+  // valid inlined depth set.
   StopInfoSP stop_info_sp = m_thread.GetStopInfo();
   if (!stop_info_sp)
     return;
+  switch (stop_info_sp->GetStopReason()) {
+  case eStopReasonWatchpoint:
+  case eStopReasonException:
+  case eStopReasonExec:
+  case eStopReasonFork:
+  case eStopReasonVFork:
+  case eStopReasonVForkDone:
+  case eStopReasonSignal:
+    // In all these cases we want to stop in the deepest frame.
+    m_current_inlined_pc = curr_pc;
+    m_current_inlined_depth = 0;
+    break;
+  case eStopReasonBreakpoint: {
+    // FIXME: Figure out what this break point is doing, and set the inline
+    // depth appropriately.  Be careful to take into account breakpoints that
+    // implement step over prologue, since that should do the default
+    // calculation. For now, if the breakpoints corresponding to this hit are
+    // all internal, I set the stop location to the top of the inlined stack,
+    // since that will make things like stepping over prologues work right.
+    // But if there are any non-internal breakpoints I do to the bottom of the
+    // stack, since that was the old behavior.
+    uint32_t bp_site_id = stop_info_sp->GetValue();
+    BreakpointSiteSP bp_site_sp(
+        m_thread.GetProcess()->GetBreakpointSiteList().FindByID(bp_site_id));
+    bool all_internal = true;
+    if (bp_site_sp) {
+      uint32_t num_owners = bp_site_sp->GetNumberOfConstituents();
+      for (uint32_t i = 0; i < num_owners; i++) {
+        Breakpoint &bp_ref =
+            bp_site_sp->GetConstituentAtIndex(i)->GetBreakpoint();
+        if (!bp_ref.IsInternal()) {
+          all_internal = false;
+        }
+      }
+    }
+    if (!all_internal) {
+      m_current_inlined_pc = curr_pc;
+      m_current_inlined_depth = 0;
+      break;
+    }
+  }
+    [[fallthrough]];
+  default: {
+    // Otherwise, we should set ourselves at the container of the inlining, so
+    // that the user can descend into them. So first we check whether we have
+    // more than one inlined block sharing this PC:
+    int num_inlined_functions = 0;
+
+    for (Block *container_ptr = block_ptr->GetInlinedParent();
+         container_ptr != nullptr;
+         container_ptr = container_ptr->GetInlinedParent()) {
+      if (!container_ptr->GetRangeContainingAddress(pc_as_address,
+                                                    containing_range))
+        break;
+      if (pc_as_address != containing_range.GetBaseAddress())
+        break;
 
-  bool inlined = true;
-  auto inline_depth = stop_info_sp->GetSuggestedStackFrameIndex(inlined);
-  // We're only adjusting the inlined stack here.
-  Log *log = GetLog(LLDBLog::Step);
-  if (inline_depth) {
-    m_current_inlined_depth = *inline_depth;
-    m_current_inlined_pc = m_thread.GetRegisterContext()->GetPC();
-
+      num_inlined_functions++;
+    }
+    m_current_inlined_pc = curr_pc;
+    m_current_inlined_depth = num_inlined_functions + 1;
+    Log *log = GetLog(LLDBLog::Step);
     if (log && log->GetVerbose())
       LLDB_LOGF(log,
                 "ResetCurrentInlinedDepth: setting inlined "
                 "depth: %d 0x%" PRIx64 ".\n",
-                m_current_inlined_depth, m_current_inlined_pc);
-  } else {
-    if (log && log->GetVerbose())
-      LLDB_LOGF(
-          log,
-          "ResetCurrentInlinedDepth: Invalidating current inlined depth.\n");
+                m_current_inlined_depth, curr_pc);
+
+    break;
+  }
   }
 }
 
@@ -727,48 +816,19 @@ void StackFrameList::SelectMostRelevantFrame() {
 
   RecognizedStackFrameSP recognized_frame_sp = frame_sp->GetRecognizedFrame();
 
-  if (recognized_frame_sp) {
-    if (StackFrameSP most_relevant_frame_sp =
-            recognized_frame_sp->GetMostRelevantFrame()) {
-      LLDB_LOG(log, "Found most relevant frame at index {0}",
-               most_relevant_frame_sp->GetFrameIndex());
-      SetSelectedFrame(most_relevant_frame_sp.get());
-      return;
-    }
-  }
-  LLDB_LOG(log, "Frame #0 not recognized");
-
-  // If this thread has a non-trivial StopInof, then let it suggest
-  // a most relevant frame:
-  StopInfoSP stop_info_sp = m_thread.GetStopInfo();
-  uint32_t stack_idx = 0;
-  bool found_relevant = false;
-  if (stop_info_sp) {
-    // Here we're only asking the stop info if it wants to adjust the real stack
-    // index.  We have to ask about the m_inlined_stack_depth in
-    // Thread::ShouldStop since the plans need to reason with that info.
-    bool inlined = false;
-    std::optional<uint32_t> stack_opt =
-        stop_info_sp->GetSuggestedStackFrameIndex(inlined);
-    if (stack_opt) {
-      stack_idx = *stack_opt;
-      found_relevant = true;
-    }
+  if (!recognized_frame_sp) {
+    LLDB_LOG(log, "Frame #0 not recognized");
+    return;
   }
 
-  frame_sp = GetFrameAtIndex(stack_idx);
-  if (!frame_sp)
-    LLDB_LOG(log, "Stop info suggested relevant frame {0} but it didn't exist",
-             stack_idx);
-  else if (found_relevant)
-    LLDB_LOG(log, "Setting selected frame from stop info to {0}", stack_idx);
-  // Note, we don't have to worry about "inlined" frames here, because we've
-  // already calculated the inlined frame in Thread::ShouldStop, and
-  // SetSelectedFrame will take care of that adjustment for us.
-  SetSelectedFrame(frame_sp.get());
-
-  if (!found_relevant)
+  if (StackFrameSP most_relevant_frame_sp =
+          recognized_frame_sp->GetMostRelevantFrame()) {
+    LLDB_LOG(log, "Found most relevant frame at index {0}",
+             most_relevant_frame_sp->GetFrameIndex());
+    SetSelectedFrame(most_relevant_frame_sp.get());
+  } else {
     LLDB_LOG(log, "No relevant frame!");
+  }
 }
 
 uint32_t StackFrameList::GetSelectedFrameIndex(
@@ -781,7 +841,6 @@ uint32_t StackFrameList::GetSelectedFrameIndex(
     // isn't set, then don't force a selection here, just return 0.
     if (!select_most_relevant)
       return 0;
-    // If the inlined stack frame is set, then use that:
     m_selected_frame_idx = 0;
   }
   return *m_selected_frame_idx;
diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp
index f6387d47504e6..60aa65ed38c74 100644
--- a/lldb/source/Target/StopInfo.cpp
+++ b/lldb/source/Target/StopInfo.cpp
@@ -15,7 +15,6 @@
 #include "lldb/Breakpoint/WatchpointResource.h"
 #include "lldb/Core/Debugger.h"
 #include "lldb/Expression/UserExpression.h"
-#include "lldb/Symbol/Block.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/StopInfo.h"
 #include "lldb/Target/Target.h"
@@ -247,22 +246,6 @@ class StopInfoBreakpoint : public StopInfo {
     return m_description.c_str();
   }
 
-  std::optional<uint32_t>
-  GetSuggestedStackFrameIndex(bool inlined_stack) override {
-    if (!inlined_stack)
-      return {};
-
-    ThreadSP thread_sp(m_thread_wp.lock());
-    if (!thread_sp)
-      return {};
-    BreakpointSiteSP bp_site_sp(
-        thread_sp->GetProcess()->GetBreakpointSiteList().FindByID(m_value));
-    if (!bp_site_sp)
-      return {};
-
-    return bp_site_sp->GetSuggestedStackFrameIndex();
-  }
-
 protected:
   bool ShouldStop(Event *event_ptr) override {
     // This just reports the work done by PerformAction or the synchronous
@@ -1181,44 +1164,6 @@ class StopInfoTrace : public StopInfo {
     else
       return m_description.c_str();
   }
-
-  std::optional<uint32_t>
-  GetSuggestedStackFrameIndex(bool inlined_stack) override {
-    // Trace only knows how to adjust inlined stacks:
-    if (!inlined_stack)
-      return {};
-
-    ThreadSP thread_sp = GetThread();
-    StackFrameSP frame_0_sp = thread_sp->GetStackFrameAtIndex(0);
-    if (!frame_0_sp)
-      return {};
-    if (!frame_0_sp->IsInlined())
-      return {};
-    Block *block_ptr = frame_0_sp->GetFrameBlock();
-    if (!block_ptr)
-      return {};
-    Address pc_address = frame_0_sp->GetFrameCodeAddress();
-    AddressRange containing_range;
-    if (!block_ptr->GetRangeContainingAddress(pc_address, containing_range) ||
-        pc_address != containing_range.GetBaseAddress())
-      return {};
-
-    int num_inlined_functions = 0;
-
-    for (Block *container_ptr = block_ptr->GetInlinedParent();
-         container_ptr != nullptr;
-         container_ptr = container_ptr->GetInlinedParent()) {
-      if (!container_ptr->GetRangeContainingAddress(pc_address,
-                                                    containing_range))
-        break;
-      if (pc_address != containing_range.GetBaseAddress())
-        break;
-
-      num_inlined_functions++;
-    }
-    inlined_stack = true;
-    return num_inlined_functions + 1;
-  }
 };
 
 // StopInfoException
diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp
index 735295e6f2593..8373cdc36268f 100644
--- a/lldb/source/Target/Thread.cpp
+++ b/lldb/source/Target/Thread.cpp
@@ -619,14 +619,6 @@ void Thread::WillStop() {
 
 void Thread::SetupForResume() {
   if (GetResumeState() != eStateSuspended) {
-    // First check whether this thread is going to "actually" resume at all.
-    // For instance, if we're stepping from one level to the next of an
-    // virtual inlined call stack, we just change the inlined call stack index
-    // without actually running this thread.  In that case, for this thread we
-    // shouldn't push a step over breakpoint plan or do that work.
-    if (GetCurrentPlan()->IsVirtualStep())
-      return;
-
     // If we're at a breakpoint push the step-over breakpoint plan.  Do this
     // before telling the current plan it will resume, since we might change
     // what the current plan is.
diff --git a/lldb/source/Target/ThreadPlanStepInRange.cpp b/lldb/source/Target/ThreadPlanStepInRange.cpp
index 325a70619908b..567dcc26d0d37 100644
--- a/lldb/source/Target/ThreadPlanStepInRange.cpp
+++ b/lldb/source/Target/ThreadPlanStepInRange.cpp
@@ -41,7 +41,7 @@ ThreadPlanStepInRange::ThreadPlanStepInRange(
                           "Step Range stepping in", thread, range, addr_context,
                           stop_others),
       ThreadPlanShouldStopHere(this), m_step_past_prologue(true),
-      m_virtual_step(eLazyBoolCalculate), m_step_into_target(step_into_target) {
+      m_virtual_step(false), m_step_into_target(step_into_target) {
   SetCallbacks();
   SetFlagsToDefault();
   SetupAvoidNoDebug(step_in_avoids_code_without_debug_info,
@@ -149,7 +149,7 @@ bool ThreadPlanStepInRange::ShouldStop(Event *event_ptr) {
       m_sub_plan_sp.reset();
   }
 
-  if (m_virtual_step == eLazyBoolYes) {
+  if (m_virtual_step) {
     // If we've just completed a virtual step, all we need to do is check for a
     // ShouldStopHere plan, and otherwise we're done.
     // FIXME - This can be both a step in and a step out.  Probably should
@@ -431,7 +431,7 @@ bool ThreadPlanStepInRange::DoPlanExplainsStop(Event *event_ptr) {
 
   bool return_value = false;
 
-  if (m_virtual_step == eLazyBoolYes) {
+  if (m_virtual_step) {
     return_value = true;
   } else {
     StopInfoSP stop_info_sp = GetPrivateStopInfo();
@@ -460,13 +460,10 @@ bool ThreadPlanStepInRange::DoPlanExplainsStop(Event *event_ptr) {
 
 bool ThreadPlanStepInRange::DoWillResume(lldb::StateType resume_state,
                                          bool current_plan) {
-  m_virtual_step = eLazyBoolCalculate;
+  m_virtual_step = false;
   if (resume_state == eStateStepping && current_plan) {
     Thread &thread = GetThread();
     // See if we are about to step over a virtual inlined call.
-    // But if we already know we're virtual stepping, don't decrement the
-    // inlined depth again...
-
     bool step_without_resume = thread.DecrementCurrentInlinedDepth();
     if (step_without_resume) {
       Log *log = GetLog(LLDBLog::Step);
@@ -479,20 +476,11 @@ bool ThreadPlanStepInRange::DoWillResume(lldb::StateType resume_state,
       // FIXME: Maybe it would be better to create a InlineStep stop reason, but
       // then
       // the whole rest of the world would have to handle that stop reason.
-      m_virtual_step = eLazyBoolYes;
+      m_virtual_step = true;
     }
     return !step_without_resume;
   }
   return true;
 }
 
-bool ThreadPlanStepInRange::IsVirtualStep() {
-  if (m_virtual_step == eLazyBoolCalculate) {
-    Thread &thread = GetThread();
-    if (thread.GetCurrentInlinedDepth() == UINT32_MAX)
-      m_virtual_step = eLazyBoolNo;
-    else
-      m_virtual_step = eLazyBoolYes;
-  }
-  return m_virtual_step == eLazyBoolYes;
-}
+bool ThreadPlanStepInRange::IsVirtualStep() { return m_virtual_step; }
diff --git a/lldb/source/Target/ThreadPlanStepOverRange.cpp b/lldb/source/Target/ThreadPlanStepOverRange.cpp
index 643ee827c865c..ef5b4b5c434d1 100644
--- a/lldb/source/Target/ThreadPlanStepOverRange.cpp
+++ b/lldb/source/Target/ThreadPlanStepOverRange.cpp
@@ -402,7 +402,7 @@ bool ThreadPlanStepOverRange::DoWillResume(lldb::StateType resume_state,
       if (in_inlined_stack) {
         Log *log = GetLog(LLDBLog::Step);
         LLDB_LOGF(log,
-                  "ThreadPlanStepOverRange::DoWillResume: adjusting range to "
+                  "ThreadPlanStepInRange::DoWillResume: adjusting range to "
                   "the frame at inlined depth %d.",
                   thread.GetCurrentInlinedDepth());
         StackFrameSP stack_sp = thread.GetStackFrameAtIndex(0);
diff --git a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py
index f52e0f0fd5bcf..752c3a9cbd286 100644
--- a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py
+++ b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py
@@ -32,12 +32,6 @@ def test_step_in_template_with_python_api(self):
         self.build()
         self.step_in_template()
 
-    @add_test_categories(["pyapi"])
-    def test_virtual_inline_stepping(self):
-        """Test stepping through a virtual inlined call stack"""
-        self.build()
-        self.virtual_inline_stepping()
-
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
@@ -363,60 +357,3 @@ def step_in_template(self):
 
         step_sequence = [["// In max_value specialized", "into"]]
         self.run_step_sequence(step_sequence)
-
-    def run_to_call_site_and_step(self, source_regex, func_name, start_pos):
-        main_spec = lldb.SBFileSpec("calling.cpp")
-        # Set the breakpoint by file and line, not sourced regex because
-        # we want to make sure we can set breakpoints on call sites:
-        call_site_line_num = line_number(self.main_source, source_regex)
-        target, process, thread, bkpt = lldbutil.run_to_line_breakpoint(
-            self, main_spec, call_site_line_num
-        )
-
-        # Make sure that the location is at the call site (run_to_line_breakpoint already asserted
-        # that there's one location.):
-        bkpt_loc = bkpt.location[0]
-        strm = lldb.SBStream()
-        result = bkpt_loc.GetDescription(strm, lldb.eDescriptionLevelFull)
-
-        self.assertTrue(result, "Got a location description")
-        desc = strm.GetData()
-        self.assertIn(f"calling.cpp:{call_site_line_num}", desc, "Right line listed")
-        # We don't get the function name right yet - so we omit it in printing.
-        # Turn on this test when that is working.
-        # self.assertIn(func_name, desc, "Right function listed")
-
-        pc = thread.frame[0].pc
-        for i in range(start_pos, 3):
-            thread.StepInto()
-            frame_0 = thread.frame[0]
-
-            trivial_line_num = line_number(
-                self.main_source, f"In caller_trivial_inline_{i}."
-            )
-            self.assertEqual(
-                frame_0.line_entry.line,
-                trivial_line_num,
-                f"Stepped into the caller_trivial_inline_{i}",
-            )
-            if pc != frame_0.pc:
-                # If we get here, we stepped to the expected line number, but
-                # the compiler on this system has decided to insert an instruction
-                # between the call site of an inlined function with no arguments,
-                # returning void, and its immediate call to another void inlined function
-                # with no arguments.  We aren't going to be testing virtual inline
-                # stepping for this function...
-                break
-
-        process.Kill()
-        target.Clear()
-
-    def virtual_inline_stepping(self):
-        """Use the Python API's to step through a virtual inlined stack"""
-        self.run_to_call_site_and_step("At caller_trivial_inline_1", "main", 1)
-        self.run_to_call_site_and_step(
-            "In caller_trivial_inline_1", "caller_trivial_inline_1", 2
-        )
-        self.run_to_call_site_and_step(
-            "In caller_trivial_inline_2", "caller_trivial_inline_2", 3
-        )
diff --git a/lldb/test/API/functionalities/inline-stepping/calling.cpp b/lldb/test/API/functionalities/inline-stepping/calling.cpp
index d7ee56b3c0790..49179ce7c9788 100644
--- a/lldb/test/API/functionalities/inline-stepping/calling.cpp
+++ b/lldb/test/API/functionalities/inline-stepping/calling.cpp
@@ -13,12 +13,6 @@ int called_by_inline_ref (int &value);
 inline void inline_trivial_1 () __attribute__((always_inline));
 inline void inline_trivial_2 () __attribute__((always_inline));
 
-// These three should share the same initial pc so we can test
-// virtual inline stepping.
-inline void caller_trivial_inline_1() __attribute__((always_inline));
-inline void caller_trivial_inline_2() __attribute__((always_inline));
-inline void caller_trivial_inline_3() __attribute__((always_inline));
-
 void caller_trivial_1 ();
 void caller_trivial_2 ();
 
@@ -85,23 +79,6 @@ caller_trivial_2 ()
     inline_value += 1;  // At increment in caller_trivial_2.
 }
 
-// When you call caller_trivial_inline_1, the inlined call-site
-// should share a PC with all three of the following inlined
-// functions, so we can exercise "virtual inline stepping".
-void caller_trivial_inline_1() {
-  caller_trivial_inline_2(); // In caller_trivial_inline_1.
-  inline_value += 1;
-}
-
-void caller_trivial_inline_2() {
-  caller_trivial_inline_3(); // In caller_trivial_inline_2.
-  inline_value += 1;
-}
-
-void caller_trivial_inline_3() {
-  inline_value += 1; // In caller_trivial_inline_3.
-}
-
 void
 called_by_inline_trivial ()
 {
@@ -155,7 +132,5 @@ main (int argc, char **argv)
     max_value(123, 456);                                // Call max_value template
     max_value(std::string("abc"), std::string("0022")); // Call max_value specialized
 
-    caller_trivial_inline_1(); // At caller_trivial_inline_1.
-
     return 0;            // About to return from main.
 }
diff --git a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
index 60d4c3bc293a3..97908b4acaf28 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
+++ b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
@@ -208,7 +208,8 @@ std::vector<std::string> MachVMRegion::GetMemoryTypes() const {
       m_data.user_tag == VM_MEMORY_MALLOC_LARGE_REUSABLE ||
       m_data.user_tag == VM_MEMORY_MALLOC_HUGE ||
       m_data.user_tag == VM_MEMORY_REALLOC ||
-      m_data.user_tag == VM_MEMORY_SBRK) {
+      m_data.user_tag == VM_MEMORY_SBRK ||
+      m_data.user_tag == VM_MEMORY_SANITIZER) {
     types.push_back("heap");
     if (m_data.user_tag == VM_MEMORY_MALLOC_TINY) {
       types.push_back("malloc-tiny");
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index ac34ddafc5e72..2a890905dc632 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -503,7 +503,7 @@ struct DependT {
   using LocatorList = ObjectListT<I, E>;
   using TaskDependenceType = tomp::type::TaskDependenceType;
 
-  struct WithLocators { // Modern form
+  struct DepType { // The form with task dependence type.
     using TupleTrait = std::true_type;
     // Empty LocatorList means "omp_all_memory".
     std::tuple<TaskDependenceType, OPT(Iterator), LocatorList> t;
@@ -511,7 +511,7 @@ struct DependT {
 
   using Doacross = DoacrossT<T, I, E>;
   using UnionTrait = std::true_type;
-  std::variant<Doacross, WithLocators> u; // Doacross form is legacy
+  std::variant<Doacross, DepType> u; // Doacross form is legacy
 };
 
 // V5.2: [3.5] `destroy` clause
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index 902d1305c818a..d12bc260f5cf4 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -135,10 +135,14 @@ template <> struct MappingTraits<TypeIdSummary> {
   }
 };
 
-struct FunctionSummaryYaml {
+struct GlobalValueSummaryYaml {
+  // Commonly used fields
   unsigned Linkage, Visibility;
   bool NotEligibleToImport, Live, IsLocal, CanAutoHide;
   unsigned ImportType;
+  // Fields for AliasSummary
+  std::optional<uint64_t> Aliasee;
+  // Fields for FunctionSummary
   std::vector<uint64_t> Refs;
   std::vector<uint64_t> TypeTests;
   std::vector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
@@ -176,8 +180,8 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummary::ConstVCall)
 namespace llvm {
 namespace yaml {
 
-template <> struct MappingTraits<FunctionSummaryYaml> {
-  static void mapping(IO &io, FunctionSummaryYaml& summary) {
+template <> struct MappingTraits<GlobalValueSummaryYaml> {
+  static void mapping(IO &io, GlobalValueSummaryYaml &summary) {
     io.mapOptional("Linkage", summary.Linkage);
     io.mapOptional("Visibility", summary.Visibility);
     io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport);
@@ -185,6 +189,7 @@ template <> struct MappingTraits<FunctionSummaryYaml> {
     io.mapOptional("Local", summary.IsLocal);
     io.mapOptional("CanAutoHide", summary.CanAutoHide);
     io.mapOptional("ImportType", summary.ImportType);
+    io.mapOptional("Aliasee", summary.Aliasee);
     io.mapOptional("Refs", summary.Refs);
     io.mapOptional("TypeTests", summary.TypeTests);
     io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls);
@@ -199,7 +204,7 @@ template <> struct MappingTraits<FunctionSummaryYaml> {
 } // End yaml namespace
 } // End llvm namespace
 
-LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummaryYaml)
+LLVM_YAML_IS_SEQUENCE_VECTOR(GlobalValueSummaryYaml)
 
 namespace llvm {
 namespace yaml {
@@ -207,61 +212,99 @@ namespace yaml {
 // FIXME: Add YAML mappings for the rest of the module summary.
 template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
   static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) {
-    std::vector<FunctionSummaryYaml> FSums;
-    io.mapRequired(Key.str().c_str(), FSums);
+    std::vector<GlobalValueSummaryYaml> GVSums;
+    io.mapRequired(Key.str().c_str(), GVSums);
     uint64_t KeyInt;
     if (Key.getAsInteger(0, KeyInt)) {
       io.setError("key not an integer");
       return;
     }
     auto &Elem = V.try_emplace(KeyInt, /*IsAnalysis=*/false).first->second;
-    for (auto &FSum : FSums) {
+    for (auto &GVSum : GVSums) {
+      GlobalValueSummary::GVFlags GVFlags(
+          static_cast<GlobalValue::LinkageTypes>(GVSum.Linkage),
+          static_cast<GlobalValue::VisibilityTypes>(GVSum.Visibility),
+          GVSum.NotEligibleToImport, GVSum.Live, GVSum.IsLocal,
+          GVSum.CanAutoHide,
+          static_cast<GlobalValueSummary::ImportKind>(GVSum.ImportType));
+      if (GVSum.Aliasee) {
+        auto ASum = std::make_unique<AliasSummary>(GVFlags);
+        if (!V.count(*GVSum.Aliasee))
+          V.emplace(*GVSum.Aliasee, /*IsAnalysis=*/false);
+        ValueInfo AliaseeVI(/*IsAnalysis=*/false, &*V.find(*GVSum.Aliasee));
+        // Note: Aliasee cannot be filled until all summaries are loaded.
+        // This is done in fixAliaseeLinks() which is called in
+        // MappingTraits<ModuleSummaryIndex>::mapping().
+        ASum->setAliasee(AliaseeVI, /*Aliasee=*/nullptr);
+        Elem.SummaryList.push_back(std::move(ASum));
+        continue;
+      }
       SmallVector<ValueInfo, 0> Refs;
-      Refs.reserve(FSum.Refs.size());
-      for (auto &RefGUID : FSum.Refs) {
+      Refs.reserve(GVSum.Refs.size());
+      for (auto &RefGUID : GVSum.Refs) {
         auto It = V.try_emplace(RefGUID, /*IsAnalysis=*/false).first;
         Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*It));
       }
       Elem.SummaryList.push_back(std::make_unique<FunctionSummary>(
-          GlobalValueSummary::GVFlags(
-              static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
-              static_cast<GlobalValue::VisibilityTypes>(FSum.Visibility),
-              FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal,
-              FSum.CanAutoHide,
-              static_cast<GlobalValueSummary::ImportKind>(FSum.ImportType)),
-          /*NumInsts=*/0, FunctionSummary::FFlags{}, std::move(Refs),
-          SmallVector<FunctionSummary::EdgeTy, 0>{}, std::move(FSum.TypeTests),
-          std::move(FSum.TypeTestAssumeVCalls),
-          std::move(FSum.TypeCheckedLoadVCalls),
-          std::move(FSum.TypeTestAssumeConstVCalls),
-          std::move(FSum.TypeCheckedLoadConstVCalls),
+          GVFlags, /*NumInsts=*/0, FunctionSummary::FFlags{}, std::move(Refs),
+          SmallVector<FunctionSummary::EdgeTy, 0>{}, std::move(GVSum.TypeTests),
+          std::move(GVSum.TypeTestAssumeVCalls),
+          std::move(GVSum.TypeCheckedLoadVCalls),
+          std::move(GVSum.TypeTestAssumeConstVCalls),
+          std::move(GVSum.TypeCheckedLoadConstVCalls),
           ArrayRef<FunctionSummary::ParamAccess>{}, ArrayRef<CallsiteInfo>{},
           ArrayRef<AllocInfo>{}));
     }
   }
   static void output(IO &io, GlobalValueSummaryMapTy &V) {
     for (auto &P : V) {
-      std::vector<FunctionSummaryYaml> FSums;
+      std::vector<GlobalValueSummaryYaml> GVSums;
       for (auto &Sum : P.second.SummaryList) {
         if (auto *FSum = dyn_cast<FunctionSummary>(Sum.get())) {
           std::vector<uint64_t> Refs;
           Refs.reserve(FSum->refs().size());
           for (auto &VI : FSum->refs())
             Refs.push_back(VI.getGUID());
-          FSums.push_back(FunctionSummaryYaml{
+          GVSums.push_back(GlobalValueSummaryYaml{
               FSum->flags().Linkage, FSum->flags().Visibility,
               static_cast<bool>(FSum->flags().NotEligibleToImport),
               static_cast<bool>(FSum->flags().Live),
               static_cast<bool>(FSum->flags().DSOLocal),
               static_cast<bool>(FSum->flags().CanAutoHide),
-              FSum->flags().ImportType, Refs, FSum->type_tests(),
-              FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(),
+              FSum->flags().ImportType, /*Aliasee=*/std::nullopt, Refs,
+              FSum->type_tests(), FSum->type_test_assume_vcalls(),
+              FSum->type_checked_load_vcalls(),
               FSum->type_test_assume_const_vcalls(),
               FSum->type_checked_load_const_vcalls()});
-          }
+        } else if (auto *ASum = dyn_cast<AliasSummary>(Sum.get());
+                   ASum && ASum->hasAliasee()) {
+          GVSums.push_back(GlobalValueSummaryYaml{
+              ASum->flags().Linkage, ASum->flags().Visibility,
+              static_cast<bool>(ASum->flags().NotEligibleToImport),
+              static_cast<bool>(ASum->flags().Live),
+              static_cast<bool>(ASum->flags().DSOLocal),
+              static_cast<bool>(ASum->flags().CanAutoHide),
+              ASum->flags().ImportType,
+              /*Aliasee=*/ASum->getAliaseeGUID()});
+        }
+      }
+      if (!GVSums.empty())
+        io.mapRequired(llvm::utostr(P.first).c_str(), GVSums);
+    }
+  }
+  static void fixAliaseeLinks(GlobalValueSummaryMapTy &V) {
+    for (auto &P : V) {
+      for (auto &Sum : P.second.SummaryList) {
+        if (auto *Alias = dyn_cast<AliasSummary>(Sum.get())) {
+          ValueInfo AliaseeVI = Alias->getAliaseeVI();
+          auto AliaseeSL = AliaseeVI.getSummaryList();
+          if (AliaseeSL.empty()) {
+            ValueInfo EmptyVI;
+            Alias->setAliasee(EmptyVI, nullptr);
+          } else
+            Alias->setAliasee(AliaseeVI, AliaseeSL[0].get());
+        }
       }
-      if (!FSums.empty())
-        io.mapRequired(llvm::utostr(P.first).c_str(), FSums);
     }
   }
 };
@@ -281,6 +324,9 @@ template <> struct CustomMappingTraits<TypeIdSummaryMapTy> {
 template <> struct MappingTraits<ModuleSummaryIndex> {
   static void mapping(IO &io, ModuleSummaryIndex& index) {
     io.mapOptional("GlobalValueMap", index.GlobalValueMap);
+    if (!io.outputting())
+      CustomMappingTraits<GlobalValueSummaryMapTy>::fixAliaseeLinks(
+          index.GlobalValueMap);
     io.mapOptional("TypeIdMap", index.TypeIdMap);
     io.mapOptional("WithGlobalValueDeadStripping",
                    index.WithGlobalValueDeadStripping);
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index d4b0b54375b02..49dcec26dbc55 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -31,6 +31,7 @@ enum class ResultReason {
   NotInstructions,
   DiffOpcodes,
   DiffTypes,
+  DiffMathFlags,
 };
 
 #ifndef NDEBUG
@@ -53,6 +54,8 @@ struct ToStr {
       return "DiffOpcodes";
     case ResultReason::DiffTypes:
       return "DiffTypes";
+    case ResultReason::DiffMathFlags:
+      return "DiffMathFlags";
     }
     llvm_unreachable("Unknown ResultReason enum");
   }
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h
index 64f57edb38484..9577e8ef7b37c 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h
@@ -12,7 +12,11 @@
 #ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_VECUTILS_H
 #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_VECUTILS_H
 
-class Utils {
+#include "llvm/SandboxIR/Type.h"
+
+namespace llvm::sandboxir {
+
+class VecUtils {
 public:
   /// \Returns the number of elements in \p Ty. That is the number of lanes if a
   /// fixed vector or 1 if scalar. ScalableVectors have unknown size and
@@ -25,6 +29,8 @@ class Utils {
   static Type *getElementType(Type *Ty) {
     return Ty->isVectorTy() ? cast<FixedVectorType>(Ty)->getElementType() : Ty;
   }
-}
+};
+
+} // namespace llvm::sandboxir
 
-#endif LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_VECUTILS_H
+#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_VECUTILS_H
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index cbf38f2c57a35..6c874fcabcc30 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -19,6 +19,7 @@
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
+#include "Utils/AArch64SMEAttributes.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index dfaa36f7f512d..9af6429c5caee 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -212,6 +212,7 @@
 #include "AArch64TargetMachine.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64SMEAttributes.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 22d632ed1e5f8..9d7051239ef99 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
+#include "Utils/AArch64SMEAttributes.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -10083,9 +10084,9 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
 // Thus, it's only used for ptrauth references to extern_weak to avoid null
 // checks.
 
-SDValue AArch64TargetLowering::LowerPtrAuthGlobalAddressStatically(
+static SDValue LowerPtrAuthGlobalAddressStatically(
     SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
-    SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) const {
+    SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
   const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
   assert(TGN->getGlobal()->hasExternalWeakLinkage());
 
@@ -27575,6 +27576,22 @@ AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
   return TargetLowering::getSafeStackPointerLocation(IRB);
 }
 
+/// If a physical register, this returns the register that receives the
+/// exception address on entry to an EH pad.
+Register AArch64TargetLowering::getExceptionPointerRegister(
+    const Constant *PersonalityFn) const {
+  // FIXME: This is a guess. Has this been defined yet?
+  return AArch64::X0;
+}
+
+/// If a physical register, this returns the register that receives the
+/// exception typeid on entry to a landing pad.
+Register AArch64TargetLowering::getExceptionSelectorRegister(
+    const Constant *PersonalityFn) const {
+  // FIXME: This is a guess. Has this been defined yet?
+  return AArch64::X1;
+}
+
 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
     const Instruction &AndI) const {
   // Only sink 'and' mask to cmp use block if it is masking a single bit, since
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 160cd18ca53b3..d696355bb062a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
 #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
 
-#include "AArch64.h"
-#include "Utils/AArch64SMEAttributes.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -830,18 +828,12 @@ class AArch64TargetLowering : public TargetLowering {
   /// If a physical register, this returns the register that receives the
   /// exception address on entry to an EH pad.
   Register
-  getExceptionPointerRegister(const Constant *PersonalityFn) const override {
-    // FIXME: This is a guess. Has this been defined yet?
-    return AArch64::X0;
-  }
+  getExceptionPointerRegister(const Constant *PersonalityFn) const override;
 
   /// If a physical register, this returns the register that receives the
   /// exception typeid on entry to a landing pad.
   Register
-  getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
-    // FIXME: This is a guess. Has this been defined yet?
-    return AArch64::X1;
-  }
+  getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
 
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
@@ -1132,11 +1124,6 @@ class AArch64TargetLowering : public TargetLowering {
                                  SelectionDAG &DAG) const;
   SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerPtrAuthGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerPtrAuthGlobalAddressStatically(SDValue TGA, SDLoc DL, EVT VT,
-                                              AArch64PACKey::ID Key,
-                                              SDValue Discriminator,
-                                              SDValue AddrDiscriminator,
-                                              SelectionDAG &DAG) const;
   SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 19ef6f4fb32e7..525538db8036c 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -11,6 +11,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetMachine.h"
+#include "Utils/AArch64SMEAttributes.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-selectiondag-info"
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ff3c69f7e10c6..71f9bbbbc3504 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -10,6 +10,7 @@
 #include "AArch64ExpandImm.h"
 #include "AArch64PerfectShuffle.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64SMEAttributes.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 6cbfb018b3183..065858c428944 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -18,6 +18,7 @@
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
+#include "Utils/AArch64SMEAttributes.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index dd65dbe594a63..6024027afaf6c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1536,6 +1536,14 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
 
 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
+  auto LowerBinOp = [&MI](unsigned Opcode) {
+    MachineIRBuilder MIB(MI);
+    MIB.buildInstr(Opcode, {MI.getOperand(0)},
+                   {MI.getOperand(2), MI.getOperand(3)});
+    MI.eraseFromParent();
+    return true;
+  };
+
   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
   switch (IntrinsicID) {
   case Intrinsic::vacopy: {
@@ -1675,37 +1683,25 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return true;
   }
   case Intrinsic::aarch64_neon_smax:
+    return LowerBinOp(TargetOpcode::G_SMAX);
   case Intrinsic::aarch64_neon_smin:
+    return LowerBinOp(TargetOpcode::G_SMIN);
   case Intrinsic::aarch64_neon_umax:
+    return LowerBinOp(TargetOpcode::G_UMAX);
   case Intrinsic::aarch64_neon_umin:
+    return LowerBinOp(TargetOpcode::G_UMIN);
   case Intrinsic::aarch64_neon_fmax:
+    return LowerBinOp(TargetOpcode::G_FMAXIMUM);
   case Intrinsic::aarch64_neon_fmin:
+    return LowerBinOp(TargetOpcode::G_FMINIMUM);
   case Intrinsic::aarch64_neon_fmaxnm:
-  case Intrinsic::aarch64_neon_fminnm: {
-    MachineIRBuilder MIB(MI);
-    if (IntrinsicID == Intrinsic::aarch64_neon_smax)
-      MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
-    else if (IntrinsicID == Intrinsic::aarch64_neon_smin)
-      MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
-    else if (IntrinsicID == Intrinsic::aarch64_neon_umax)
-      MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
-    else if (IntrinsicID == Intrinsic::aarch64_neon_umin)
-      MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
-    else if (IntrinsicID == Intrinsic::aarch64_neon_fmax)
-      MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)},
-                     {MI.getOperand(2), MI.getOperand(3)});
-    else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
-      MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
-                     {MI.getOperand(2), MI.getOperand(3)});
-    else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
-      MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
-                     {MI.getOperand(2), MI.getOperand(3)});
-    else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
-      MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
-                     {MI.getOperand(2), MI.getOperand(3)});
-    MI.eraseFromParent();
-    return true;
-  }
+    return LowerBinOp(TargetOpcode::G_FMAXNUM);
+  case Intrinsic::aarch64_neon_fminnm:
+    return LowerBinOp(TargetOpcode::G_FMINNUM);
+  case Intrinsic::aarch64_neon_smull:
+    return LowerBinOp(AArch64::G_UMULL);
+  case Intrinsic::aarch64_neon_umull:
+    return LowerBinOp(AArch64::G_SMULL);
   case Intrinsic::vector_reverse:
     // TODO: Add support for vector_reverse
     return false;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 147b32b1ca990..68ae5de06423c 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -47,6 +47,7 @@ def ResRetInt32Ty : DXILOpParamType;
 def HandleTy : DXILOpParamType;
 def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
+def SplitDoubleTy : DXILOpParamType;
 
 class DXILOpClass;
 
@@ -779,6 +780,15 @@ def FlattenedThreadIdInGroup :  DXILOp<96, flattenedThreadIdInGroup> {
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
+def SplitDouble :  DXILOp<102, splitDouble> {
+  let Doc = "Splits a double into 2 uints";
+  let arguments = [OverloadTy];
+  let result = SplitDoubleTy;
+  let overloads = [Overloads<DXIL1_0, [DoubleTy]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
 def AnnotateHandle : DXILOp<217, annotateHandle> {
   let Doc = "annotate handle with resource properties";
   let arguments = [HandleTy, ResPropsTy];
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 7719d6b107911..5d5bb3eacace2 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -229,6 +229,13 @@ static StructType *getResPropsType(LLVMContext &Context) {
   return StructType::create({Int32Ty, Int32Ty}, "dx.types.ResourceProperties");
 }
 
+static StructType *getSplitDoubleType(LLVMContext &Context) {
+  if (auto *ST = StructType::getTypeByName(Context, "dx.types.splitdouble"))
+    return ST;
+  Type *Int32Ty = Type::getInt32Ty(Context);
+  return StructType::create({Int32Ty, Int32Ty}, "dx.types.splitdouble");
+}
+
 static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
                                     Type *OverloadTy) {
   switch (Kind) {
@@ -266,6 +273,8 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return getResBindType(Ctx);
   case OpParamType::ResPropsTy:
     return getResPropsType(Ctx);
+  case OpParamType::SplitDoubleTy:
+    return getSplitDoubleType(Ctx);
   }
   llvm_unreachable("Invalid parameter kind");
   return nullptr;
@@ -467,6 +476,10 @@ StructType *DXILOpBuilder::getResRetType(Type *ElementTy) {
   return ::getResRetType(ElementTy);
 }
 
+StructType *DXILOpBuilder::getSplitDoubleType(LLVMContext &Context) {
+  return ::getSplitDoubleType(Context);
+}
+
 StructType *DXILOpBuilder::getHandleType() {
   return ::getHandleType(IRB.getContext());
 }
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h
index 037ae3822cfb9..df5a0240870f4 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.h
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h
@@ -49,6 +49,10 @@ class DXILOpBuilder {
 
   /// Get a `%dx.types.ResRet` type with the given element type.
   StructType *getResRetType(Type *ElementTy);
+
+  /// Get the `%dx.types.splitdouble` type.
+  StructType *getSplitDoubleType(LLVMContext &Context);
+
   /// Get the `%dx.types.Handle` type.
   StructType *getHandleType();
 
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index c62ba8c21d679..f7722d7707476 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsDirectX.h"
 #include "llvm/IR/Module.h"
@@ -128,6 +129,30 @@ class OpLowerer {
     });
   }
 
+  [[nodiscard]] bool replaceFunctionWithNamedStructOp(
+      Function &F, dxil::OpCode DXILOp, Type *NewRetTy,
+      llvm::function_ref<Error(CallInst *CI, CallInst *Op)> ReplaceUses) {
+    bool IsVectorArgExpansion = isVectorArgExpansion(F);
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
+      SmallVector<Value *> Args;
+      OpBuilder.getIRB().SetInsertPoint(CI);
+      if (IsVectorArgExpansion) {
+        SmallVector<Value *> NewArgs = argVectorFlatten(CI, OpBuilder.getIRB());
+        Args.append(NewArgs.begin(), NewArgs.end());
+      } else
+        Args.append(CI->arg_begin(), CI->arg_end());
+
+      Expected<CallInst *> OpCall =
+          OpBuilder.tryCreateOp(DXILOp, Args, CI->getName(), NewRetTy);
+      if (Error E = OpCall.takeError())
+        return E;
+      if (Error E = ReplaceUses(CI, *OpCall))
+        return E;
+
+      return Error::success();
+    });
+  }
+
   /// Create a cast between a `target("dx")` type and `dx.types.Handle`, which
   /// is intended to be removed by the end of lowering. This is used to allow
   /// lowering of ops which need to change their return or argument types in a
@@ -263,6 +288,26 @@ class OpLowerer {
     return lowerToBindAndAnnotateHandle(F);
   }
 
+  Error replaceSplitDoubleCallUsages(CallInst *Intrin, CallInst *Op) {
+    for (Use &U : make_early_inc_range(Intrin->uses())) {
+      if (auto *EVI = dyn_cast<ExtractValueInst>(U.getUser())) {
+
+        if (EVI->getNumIndices() != 1)
+          return createStringError(std::errc::invalid_argument,
+                                   "Splitdouble has only 2 elements");
+        EVI->setOperand(0, Op);
+      } else {
+        return make_error<StringError>(
+            "Splitdouble use is not ExtractValueInst",
+            inconvertibleErrorCode());
+      }
+    }
+
+    Intrin->eraseFromParent();
+
+    return Error::success();
+  }
+
   /// Replace uses of \c Intrin with the values in the `dx.ResRet` of \c Op.
   /// Since we expect to be post-scalarization, make an effort to avoid vectors.
   Error replaceResRetUses(CallInst *Intrin, CallInst *Op, bool HasCheckBit) {
@@ -488,6 +533,16 @@ class OpLowerer {
       case Intrinsic::dx_typedBufferStore:
         HasErrors |= lowerTypedBufferStore(F);
         break;
+      // TODO: this can be removed when
+      // https://github.com/llvm/llvm-project/issues/113192 is fixed
+      case Intrinsic::dx_splitdouble:
+        HasErrors |= replaceFunctionWithNamedStructOp(
+            F, OpCode::SplitDouble,
+            OpBuilder.getSplitDoubleType(M.getContext()),
+            [&](CallInst *CI, CallInst *Op) {
+              return replaceSplitDoubleCallUsages(CI, Op);
+            });
+        break;
       }
       Updated = true;
     }
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 3fcfc6a876776..6ba371069bb23 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -2083,8 +2083,12 @@ bool LowerTypeTestsModule::lower() {
     for (auto &I : *ExportSummary)
       for (auto &GVS : I.second.SummaryList)
         if (GVS->isLive())
-          for (const auto &Ref : GVS->refs())
+          for (const auto &Ref : GVS->refs()) {
             AddressTaken.insert(Ref.getGUID());
+            for (auto &RefGVS : Ref.getSummaryList())
+              if (auto Alias = dyn_cast<AliasSummary>(RefGVS.get()))
+                AddressTaken.insert(Alias->getAliaseeGUID());
+          }
 
     NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
     if (CfiFunctionsMD) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 88086f24dfdce..778d928252e05 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7562,67 +7562,62 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
   }
 }
 
-// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
-// create a merge phi node for it.
-static void createAndCollectMergePhiForReduction(
-    VPInstruction *RedResult,
-    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
-    bool VectorizingEpilogue) {
-  if (!RedResult ||
-      RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
+// fix the reduction's scalar PHI node by adding the incoming value from the
+// main vector loop.
+static void fixReductionScalarResumeWhenVectorizingEpilog(
+    VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) {
+  auto *EpiRedResult = dyn_cast<VPInstruction>(R);
+  if (!EpiRedResult ||
+      EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
     return;
 
-  auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
-  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-
-  Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane()));
-  auto *ResumePhi =
-      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-  if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
-                                 RdxDesc.getRecurrenceKind())) {
-    auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
-    assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
-    assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
-    ResumePhi = cast<PHINode>(Cmp->getOperand(0));
-  }
-  assert((!VectorizingEpilogue || ResumePhi) &&
-         "when vectorizing the epilogue loop, we need a resume phi from main "
-         "vector loop");
-
-  // TODO: bc.merge.rdx should not be created here, instead it should be
-  // modeled in VPlan.
-  BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
-  // Create a phi node that merges control-flow from the backedge-taken check
-  // block and the middle block.
-  auto *BCBlockPhi =
-      PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
-                      LoopScalarPreHeader->getTerminator()->getIterator());
-
-  // If we are fixing reductions in the epilogue loop then we should already
-  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
-  // we carry over the incoming values correctly.
+  auto *EpiRedHeaderPhi =
+      cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
+  const RecurrenceDescriptor &RdxDesc =
+      EpiRedHeaderPhi->getRecurrenceDescriptor();
+  Value *MainResumeValue =
+      EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
+  if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+          RdxDesc.getRecurrenceKind())) {
+    auto *Cmp = cast<ICmpInst>(MainResumeValue);
+    assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
+           "AnyOf expected to start with ICMP_NE");
+    assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
+           "AnyOf expected to start by comparing main resume value to original "
+           "start value");
+    MainResumeValue = Cmp->getOperand(0);
+  }
+  PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
+
+  // When fixing reductions in the epilogue loop we should already have
+  // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
+  // over the incoming values correctly.
+  using namespace VPlanPatternMatch;
+  auto IsResumePhi = [](VPUser *U) {
+    return match(
+        U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
+  };
+  assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
+         "ResumePhi must have a single user");
+  auto *EpiResumePhiVPI =
+      cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
+  auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
+  BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent();
+  bool Updated = false;
   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
-    if (Incoming == LoopMiddleBlock)
-      BCBlockPhi->addIncoming(FinalValue, Incoming);
-    else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
-      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
-                              Incoming);
-    else
-      BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
+    if (is_contained(MainResumePhi->blocks(), Incoming)) {
+      assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
+                 RdxDesc.getRecurrenceStartValue() &&
+             "Trying to reset unexpected value");
+      assert(!Updated && "Should update at most 1 incoming value");
+      EpiResumePhi->setIncomingValueForBlock(
+          Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
+      Updated = true;
+    }
   }
-
-  auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
-  // TODO: This fixup should instead be modeled in VPlan.
-  // Fix the scalar loop reduction variable with the incoming reduction sum
-  // from the vector body and from the backedge value.
-  int IncomingEdgeBlockIdx =
-      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
-  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
-  // Pick the other block.
-  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
-  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
-  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+  assert(Updated && "Must update EpiResumePhi.");
+  (void)Updated;
 }
 
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
@@ -7713,11 +7708,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(
-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue);
-  }
+  if (VectorizingEpilogue)
+    for (VPRecipeBase &R : *ExitVPBB) {
+      fixReductionScalarResumeWhenVectorizingEpilog(
+          &R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
+    }
 
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
@@ -9518,6 +9513,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         });
     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
 
+    // Order is strict: if there are multiple successors, the first is the exit
+    // block, second is the scalar preheader.
+    VPBasicBlock *ScalarPHVPBB =
+        cast<VPBasicBlock>(MiddleVPBB->getSuccessors().back());
+    VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+    auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+        VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},
+        {}, "bc.merge.rdx");
+    auto *RedPhi = cast<PHINode>(PhiR->getUnderlyingInstr());
+    Plan->addLiveOut(RedPhi, ResumePhiRecipe);
+
     // Adjust AnyOf reductions; replace the reduction phi for the selected value
     // with a boolean reduction phi node to check if the condition is true in
     // any iteration. The final value is selected by the final
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
index e4546c2f98113..346d8a90589f5 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -8,9 +8,11 @@
 
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h"
 #include "llvm/SandboxIR/Instruction.h"
+#include "llvm/SandboxIR/Operator.h"
 #include "llvm/SandboxIR/Utils.h"
 #include "llvm/SandboxIR/Value.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h"
 
 namespace llvm::sandboxir {
 
@@ -26,7 +28,35 @@ void LegalityResult::dump() const {
 std::optional<ResultReason>
 LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
     ArrayRef<Value *> Bndl) {
-  // TODO: Unimplemented.
+  auto *I0 = cast<Instruction>(Bndl[0]);
+  auto Opcode = I0->getOpcode();
+  // If they have different opcodes, then we cannot form a vector (for now).
+  if (any_of(drop_begin(Bndl), [Opcode](Value *V) {
+        return cast<Instruction>(V)->getOpcode() != Opcode;
+      }))
+    return ResultReason::DiffOpcodes;
+
+  // If not the same scalar type, Pack. This will accept scalars and vectors as
+  // long as the element type is the same.
+  Type *ElmTy0 = VecUtils::getElementType(Utils::getExpectedType(I0));
+  if (any_of(drop_begin(Bndl), [ElmTy0](Value *V) {
+        return VecUtils::getElementType(Utils::getExpectedType(V)) != ElmTy0;
+      }))
+    return ResultReason::DiffTypes;
+
+  // TODO: Allow vectorization of instrs with different flags as long as we
+  // change them to the least common one.
+  // For now pack if differnt FastMathFlags.
+  if (isa<FPMathOperator>(I0)) {
+    FastMathFlags FMF0 = cast<Instruction>(Bndl[0])->getFastMathFlags();
+    if (any_of(drop_begin(Bndl), [FMF0](auto *V) {
+          return cast<Instruction>(V)->getFastMathFlags() != FMF0;
+        }))
+      return ResultReason::DiffMathFlags;
+  }
+
+  // TODO: Missing checks
+
   return std::nullopt;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index d677526bab000..11397703b4442 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -4,16 +4,7 @@
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; CHECK-GI:       warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for smlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for umlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for smlsl2_v4i32_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for umlsl2_v4i32_uzp1
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for smlsl_smlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for umlsl_umlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for smlsl_smlsl2_v4i32_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for umlsl_umlsl2_v4i32_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for do_stuff
 
 define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: smull_v8i8_v8i16:
@@ -2025,13 +2016,30 @@ define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
 }
 
 define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
-; CHECK-LABEL: smlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q2, [x1, #16]
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q2, [x1, #16]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    umlsl v1.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %5 = getelementptr inbounds i32, ptr %3, i64 4
   %6 = load <8 x i16>, ptr %5, align 4
   %7 = trunc <8 x i16> %6 to <8 x i8>
@@ -2043,13 +2051,30 @@ define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
 }
 
 define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
-; CHECK-LABEL: umlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q2, [x1, #16]
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q2, [x1, #16]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    smlsl v1.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %5 = getelementptr inbounds i32, ptr %3, i64 4
   %6 = load <8 x i16>, ptr %5, align 4
   %7 = trunc <8 x i16> %6 to <8 x i8>
@@ -2061,13 +2086,30 @@ define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
 }
 
 define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
-; CHECK-LABEL: smlsl2_v4i32_uzp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q2, [x1, #16]
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smlsl2_v4i32_uzp1:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEON-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smlsl2_v4i32_uzp1:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
+; CHECK-SVE-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smlsl2_v4i32_uzp1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q2, [x1, #16]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    umlsl v1.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %5 = getelementptr inbounds i32, ptr %3, i64 4
   %6 = load <4 x i32>, ptr %5, align 4
   %7 = trunc <4 x i32> %6 to <4 x i16>
@@ -2079,13 +2121,30 @@ define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
 }
 
 define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
-; CHECK-LABEL: umlsl2_v4i32_uzp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q2, [x1, #16]
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umlsl2_v4i32_uzp1:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEON-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umlsl2_v4i32_uzp1:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
+; CHECK-SVE-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umlsl2_v4i32_uzp1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q2, [x1, #16]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    smlsl v1.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    str q1, [x0]
+; CHECK-GI-NEXT:    ret
   %5 = getelementptr inbounds i32, ptr %3, i64 4
   %6 = load <4 x i32>, ptr %5, align 4
   %7 = trunc <4 x i32> %6 to <4 x i16>
@@ -2124,14 +2183,35 @@ entry:
 }
 
 define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
-; CHECK-LABEL: smlsl_smlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    smlsl v1.8h, v0.8b, v2.8b
-; CHECK-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smlsl_smlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    smlsl v1.8h, v0.8b, v2.8b
+; CHECK-NEON-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smlsl_smlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT:    smlsl v1.8h, v0.8b, v2.8b
+; CHECK-SVE-NEXT:    smlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smlsl_smlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldp q4, q2, [x1]
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    xtn v4.8b, v4.8h
+; CHECK-GI-NEXT:    umull v2.8h, v3.8b, v2.8b
+; CHECK-GI-NEXT:    umlal v2.8h, v0.8b, v4.8b
+; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %5 = load <8 x i16>, ptr %3, align 4
   %6 = trunc <8 x i16> %5 to <8 x i8>
@@ -2149,14 +2229,35 @@ entry:
 }
 
 define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
-; CHECK-LABEL: umlsl_umlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    umlsl v1.8h, v0.8b, v2.8b
-; CHECK-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umlsl_umlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    umlsl v1.8h, v0.8b, v2.8b
+; CHECK-NEON-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umlsl_umlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT:    umlsl v1.8h, v0.8b, v2.8b
+; CHECK-SVE-NEXT:    umlsl2 v1.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umlsl_umlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldp q4, q2, [x1]
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    xtn v4.8b, v4.8h
+; CHECK-GI-NEXT:    smull v2.8h, v3.8b, v2.8b
+; CHECK-GI-NEXT:    smlal v2.8h, v0.8b, v4.8b
+; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %5 = load <8 x i16>, ptr %3, align 4
   %6 = trunc <8 x i16> %5 to <8 x i8>
@@ -2174,14 +2275,35 @@ entry:
 }
 
 define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
-; CHECK-LABEL: smlsl_smlsl2_v4i32_uzp1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
-; CHECK-NEXT:    smlsl v1.4s, v0.4h, v2.4h
-; CHECK-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smlsl_smlsl2_v4i32_uzp1:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEON-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEON-NEXT:    smlsl v1.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smlsl_smlsl2_v4i32_uzp1:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
+; CHECK-SVE-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SVE-NEXT:    smlsl v1.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT:    smlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smlsl_smlsl2_v4i32_uzp1:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldp q4, q2, [x1]
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    xtn v4.4h, v4.4s
+; CHECK-GI-NEXT:    umull v2.4s, v3.4h, v2.4h
+; CHECK-GI-NEXT:    umlal v2.4s, v0.4h, v4.4h
+; CHECK-GI-NEXT:    sub v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %5 = load <4 x i32>, ptr %3, align 4
   %6 = trunc <4 x i32> %5 to <4 x i16>
@@ -2199,14 +2321,35 @@ entry:
 }
 
 define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) {
-; CHECK-LABEL: umlsl_umlsl2_v4i32_uzp1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
-; CHECK-NEXT:    umlsl v1.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT:    str q1, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umlsl_umlsl2_v4i32_uzp1:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEON-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEON-NEXT:    umlsl v1.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    str q1, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umlsl_umlsl2_v4i32_uzp1:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
+; CHECK-SVE-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SVE-NEXT:    umlsl v1.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT:    umlsl2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    str q1, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umlsl_umlsl2_v4i32_uzp1:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldp q4, q2, [x1]
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    xtn v4.4h, v4.4s
+; CHECK-GI-NEXT:    smull v2.4s, v3.4h, v2.4h
+; CHECK-GI-NEXT:    smlal v2.4s, v0.4h, v4.4h
+; CHECK-GI-NEXT:    sub v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %5 = load <4 x i32>, ptr %3, align 4
   %6 = trunc <4 x i32> %5 to <4 x i16>
@@ -2224,13 +2367,31 @@ entry:
 }
 
 define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
-; CHECK-LABEL: do_stuff:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    smull2 v0.2d, v1.4s, v0.4s
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: do_stuff:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-NEON-NEXT:    smull2 v0.2d, v1.4s, v0.4s
+; CHECK-NEON-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEON-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: do_stuff:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    uzp1 v0.4s, v0.4s, v0.4s
+; CHECK-SVE-NEXT:    smull2 v0.2d, v1.4s, v0.4s
+; CHECK-SVE-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SVE-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: do_stuff:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ext v2.16b, v1.16b, v2.16b, #8
+; CHECK-GI-NEXT:    umull v0.2d, v2.2s, v0.2s
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
   %bc.1 = bitcast <2 x i64> %1 to <4 x i32>
   %trunc.0 = trunc <2 x i64> %0 to <2 x i32>
   %shuff.hi = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 2, i32 3>
diff --git a/llvm/test/CodeGen/DirectX/split-double.ll b/llvm/test/CodeGen/DirectX/split-double.ll
deleted file mode 100644
index 759590fa56279..0000000000000
--- a/llvm/test/CodeGen/DirectX/split-double.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='function(scalarizer)' -S -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
-
-define void @test_vector_double_split_void(<2 x double> noundef %d) {
-; CHECK-LABEL: define void @test_vector_double_split_void(
-; CHECK-SAME: <2 x double> noundef [[D:%.*]]) {
-; CHECK-NEXT:    [[D_I0:%.*]] = extractelement <2 x double> [[D]], i64 0
-; CHECK-NEXT:    [[HLSL_ASUINT_I0:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I0]])
-; CHECK-NEXT:    [[D_I1:%.*]] = extractelement <2 x double> [[D]], i64 1
-; CHECK-NEXT:    [[HLSL_ASUINT_I1:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I1]])
-; CHECK-NEXT:    ret void
-;
-  %hlsl.asuint = call { <2 x i32>, <2 x i32> }  @llvm.dx.splitdouble.v2i32(<2 x double> %d)
-  ret void
-}
-
-define noundef <3 x i32> @test_vector_double_split(<3 x double> noundef %d) {
-; CHECK-LABEL: define noundef <3 x i32> @test_vector_double_split(
-; CHECK-SAME: <3 x double> noundef [[D:%.*]]) {
-; CHECK-NEXT:    [[D_I0:%.*]] = extractelement <3 x double> [[D]], i64 0
-; CHECK-NEXT:    [[HLSL_ASUINT_I0:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I0]])
-; CHECK-NEXT:    [[D_I1:%.*]] = extractelement <3 x double> [[D]], i64 1
-; CHECK-NEXT:    [[HLSL_ASUINT_I1:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I1]])
-; CHECK-NEXT:    [[D_I2:%.*]] = extractelement <3 x double> [[D]], i64 2
-; CHECK-NEXT:    [[HLSL_ASUINT_I2:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I2]])
-; CHECK-NEXT:    [[DOTELEM0:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 0
-; CHECK-NEXT:    [[DOTELEM01:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I1]], 0
-; CHECK-NEXT:    [[DOTELEM02:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I2]], 0
-; CHECK-NEXT:    [[DOTELEM1:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 1
-; CHECK-NEXT:    [[DOTELEM13:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I1]], 1
-; CHECK-NEXT:    [[DOTELEM14:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I2]], 1
-; CHECK-NEXT:    [[DOTI0:%.*]] = add i32 [[DOTELEM0]], [[DOTELEM1]]
-; CHECK-NEXT:    [[DOTI1:%.*]] = add i32 [[DOTELEM01]], [[DOTELEM13]]
-; CHECK-NEXT:    [[DOTI2:%.*]] = add i32 [[DOTELEM02]], [[DOTELEM14]]
-; CHECK-NEXT:    [[DOTUPTO015:%.*]] = insertelement <3 x i32> poison, i32 [[DOTI0]], i64 0
-; CHECK-NEXT:    [[DOTUPTO116:%.*]] = insertelement <3 x i32> [[DOTUPTO015]], i32 [[DOTI1]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> [[DOTUPTO116]], i32 [[DOTI2]], i64 2
-; CHECK-NEXT:    ret <3 x i32> [[TMP1]]
-;
-  %hlsl.asuint = call { <3 x i32>, <3 x i32> }  @llvm.dx.splitdouble.v3i32(<3 x double> %d)
-  %1 = extractvalue { <3 x i32>, <3 x i32> } %hlsl.asuint, 0
-  %2 = extractvalue { <3 x i32>, <3 x i32> } %hlsl.asuint, 1
-  %3 = add <3 x i32> %1, %2
-  ret <3 x i32> %3
-}
diff --git a/llvm/test/CodeGen/DirectX/splitdouble.ll b/llvm/test/CodeGen/DirectX/splitdouble.ll
new file mode 100644
index 0000000000000..1443ba6269255
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/splitdouble.ll
@@ -0,0 +1,76 @@
+; RUN: opt -passes='function(scalarizer)' -S -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,NOLOWER
+; RUN: opt -passes='function(scalarizer),module(dxil-op-lower)' -S -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,WITHLOWER
+
+define i32 @test_scalar(double noundef %D) {
+; CHECK-LABEL: define i32 @test_scalar(
+; CHECK-SAME: double noundef [[D:%.*]]) {
+; NOLOWER-NEXT:    [[HLSL_ASUINT_I0:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D]])
+; WITHLOWER-NEXT:  [[HLSL_ASUINT_I0:%.*]] = call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102, double [[D]])
+; NOLOWER-NEXT:    [[EV1:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 0
+; NOLOWER-NEXT:    [[EV2:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 1
+; WITHLOWER-NEXT:  [[EV1:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I0]], 0
+; WITHLOWER-NEXT:  [[EV2:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I0]], 1
+; CHECK-NEXT:      [[ADD:%.*]] = add i32 [[EV1]], [[EV2]]
+; CHECK-NEXT:      ret i32 [[ADD]]
+;
+  %hlsl.splitdouble = call { i32, i32 } @llvm.dx.splitdouble.i32(double %D)
+  %1 = extractvalue { i32, i32 } %hlsl.splitdouble, 0
+  %2 = extractvalue { i32, i32 } %hlsl.splitdouble, 1
+  %add = add i32 %1, %2
+  ret i32 %add
+}
+
+
+define void @test_vector_double_split_void(<2 x double> noundef %d) {
+; CHECK-LABEL: define void @test_vector_double_split_void(
+; CHECK-SAME: <2 x double> noundef [[D:%.*]]) {
+; CHECK-NEXT:      [[D_I0:%.*]] = extractelement <2 x double> [[D]], i64 0
+; NOLOWER-NEXT:    [[HLSL_ASUINT_I0:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I0]])
+; WITHLOWER-NEXT:  [[HLSL_ASUINT_I0:%.*]] = call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102, double [[D_I0]])
+; CHECK-NEXT:      [[D_I1:%.*]] = extractelement <2 x double> [[D]], i64 1
+; NOLOWER-NEXT:    [[HLSL_ASUINT_I1:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I1]])
+; WITHLOWER-NEXT:  [[HLSL_ASUINT_I1:%.*]] = call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102, double [[D_I1]])
+; CHECK-NEXT:      ret void
+;
+  %hlsl.asuint = call { <2 x i32>, <2 x i32> }  @llvm.dx.splitdouble.v2i32(<2 x double> %d)
+  ret void
+}
+
+define noundef <3 x i32> @test_vector_double_split(<3 x double> noundef %d) {
+; CHECK-LABEL: define noundef <3 x i32> @test_vector_double_split(
+; CHECK-SAME: <3 x double> noundef [[D:%.*]]) {
+; CHECK-NEXT:      [[D_I0:%.*]] = extractelement <3 x double> [[D]], i64 0
+; NOLOWER-NEXT:    [[HLSL_ASUINT_I0:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I0]])
+; WITHLOWER-NEXT:  [[HLSL_ASUINT_I0:%.*]] = call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102, double [[D_I0]])
+; CHECK-NEXT:      [[D_I1:%.*]] = extractelement <3 x double> [[D]], i64 1
+; NOLOWER-NEXT:    [[HLSL_ASUINT_I1:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I1]])
+; WITHLOWER-NEXT:  [[HLSL_ASUINT_I1:%.*]] = call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102, double [[D_I1]])
+; CHECK-NEXT:      [[D_I2:%.*]] = extractelement <3 x double> [[D]], i64 2
+; NOLOWER-NEXT:    [[HLSL_ASUINT_I2:%.*]] = call { i32, i32 } @llvm.dx.splitdouble.i32(double [[D_I2]])
+; WITHLOWER-NEXT:  [[HLSL_ASUINT_I2:%.*]] = call %dx.types.splitdouble @dx.op.splitDouble.f64(i32 102, double [[D_I2]])
+; NOLOWER-NEXT:    [[DOTELEM0:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 0
+; WITHLOWER-NEXT:  [[DOTELEM0:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I0]], 0
+; NOLOWER-NEXT:    [[DOTELEM01:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I1]], 0
+; WITHLOWER-NEXT:  [[DOTELEM01:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I1]], 0
+; NOLOWER-NEXT:    [[DOTELEM02:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I2]], 0
+; WITHLOWER-NEXT:  [[DOTELEM02:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I2]], 0
+; NOLOWER-NEXT:    [[DOTELEM1:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I0]], 1
+; WITHLOWER-NEXT:  [[DOTELEM1:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I0]], 1
+; NOLOWER-NEXT:    [[DOTELEM13:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I1]], 1
+; WITHLOWER-NEXT:  [[DOTELEM13:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I1]], 1
+; NOLOWER-NEXT:    [[DOTELEM14:%.*]] = extractvalue { i32, i32 } [[HLSL_ASUINT_I2]], 1
+; WITHLOWER-NEXT:  [[DOTELEM14:%.*]] = extractvalue %dx.types.splitdouble [[HLSL_ASUINT_I2]], 1
+; CHECK-NEXT:      [[DOTI0:%.*]] = add i32 [[DOTELEM0]], [[DOTELEM1]]
+; CHECK-NEXT:      [[DOTI1:%.*]] = add i32 [[DOTELEM01]], [[DOTELEM13]]
+; CHECK-NEXT:      [[DOTI2:%.*]] = add i32 [[DOTELEM02]], [[DOTELEM14]]
+; CHECK-NEXT:      [[DOTUPTO015:%.*]] = insertelement <3 x i32> poison, i32 [[DOTI0]], i64 0
+; CHECK-NEXT:      [[DOTUPTO116:%.*]] = insertelement <3 x i32> [[DOTUPTO015]], i32 [[DOTI1]], i64 1
+; CHECK-NEXT:      [[TMP1:%.*]] = insertelement <3 x i32> [[DOTUPTO116]], i32 [[DOTI2]], i64 2
+; CHECK-NEXT:      ret <3 x i32> [[TMP1]]
+;
+  %hlsl.asuint = call { <3 x i32>, <3 x i32> }  @llvm.dx.splitdouble.v3i32(<3 x double> %d)
+  %1 = extractvalue { <3 x i32>, <3 x i32> } %hlsl.asuint, 0
+  %2 = extractvalue { <3 x i32>, <3 x i32> } %hlsl.asuint, 1
+  %3 = add <3 x i32> %1, %2
+  ret <3 x i32> %3
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/splitdouble.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/splitdouble.ll
new file mode 100644
index 0000000000000..d18b16b843c37
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/splitdouble.ll
@@ -0,0 +1,40 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Make sure lowering is correctly generating spirv code.
+
+; CHECK-DAG: %[[#double:]] = OpTypeFloat 64
+; CHECK-DAG: %[[#vec_2_double:]] = OpTypeVector %[[#double]] 2
+; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#vec_2_int_32:]] = OpTypeVector %[[#int_32]] 2
+; CHECK-DAG: %[[#vec_4_int_32:]] = OpTypeVector %[[#int_32]] 4
+
+
+define spir_func noundef i32 @test_scalar(double noundef %D) local_unnamed_addr {
+entry:
+  ; CHECK-LABEL: ; -- Begin function test_scalar
+  ; CHECK: %[[#param:]] = OpFunctionParameter %[[#double]]
+  ; CHECK: %[[#bitcast:]] = OpBitcast %[[#vec_2_int_32]] %[[#param]]
+  %0 = bitcast double %D to <2 x i32>
+  ; CHECK: %[[#]] = OpCompositeExtract %[[#int_32]] %[[#bitcast]] 0
+  %1 = extractelement <2 x i32> %0, i64 0
+  ; CHECK: %[[#]] = OpCompositeExtract %[[#int_32]] %[[#bitcast]] 1
+  %2 = extractelement <2 x i32> %0, i64 1
+  %add = add i32 %1, %2
+  ret i32 %add
+}
+
+
+define spir_func noundef <2 x i32> @test_vector(<2 x double> noundef %D) local_unnamed_addr {
+entry:
+  ; CHECK-LABEL: ; -- Begin function test_vector
+  ; CHECK: %[[#param:]] = OpFunctionParameter %[[#vec_2_double]]
+  ; CHECK: %[[#CAST1:]] = OpBitcast %[[#vec_4_int_32]] %[[#param]]
+  ; CHECK: %[[#SHUFF2:]] = OpVectorShuffle %[[#vec_2_int_32]] %[[#CAST1]] %[[#]] 0 2
+  ; CHECK: %[[#SHUFF3:]] = OpVectorShuffle %[[#vec_2_int_32]] %[[#CAST1]] %[[#]] 1 3
+  %0 = bitcast <2 x double> %D to <4 x i32>
+  %1 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+  %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+  %add = add <2 x i32> %1, %2
+  ret <2 x i32> %add
+}
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 0746a07d2cdf2..5ae5caf3e88b2 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -848,7 +848,7 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; AVX512-NEXT:    vpcmpgtb %xmm0, %xmm1, %k1
 ; AVX512-NEXT:    vpcmpgtb %xmm1, %xmm0, %k2
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 {%k2} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index cd643cb8d6375..6a52acfe2fb30 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -819,7 +819,7 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; AVX512-NEXT:    vpcmpltub %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpnleub %xmm1, %xmm0, %k2
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 {%k2} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    retq
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 1326751a847d7..59db6c197ef8c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-INLOOP-NEXT: No successors
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; IF-EVL-INLOOP-NEXT: }
 ;
 
@@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-OUTLOOP-NEXT: No successors
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; NO-VP-OUTLOOP-NEXT: }
 ;
 
@@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-INLOOP-NEXT: No successors
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; NO-VP-INLOOP-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 8e56614a2e3d5..b05980bef1b38 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
+; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 0dde507d08be7..2247295295663 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]>
 ; CHECK-NEXT:}
 
 entry:
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
new file mode 100644
index 0000000000000..0c5324ee96c93
--- /dev/null
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
@@ -0,0 +1,54 @@
+;; Check that if the address of a weak function is only taken through an alias,
+;; it is still added to a list of exported functions and @llvm.type.test() is
+;; lowered to an actual check against the generated CFI jumptable.
+
+RUN: rm -rf %t.dir && split-file %s %t.dir && cd %t.dir
+RUN: opt test.ll --thinlto-bc --thinlto-split-lto-unit -o test.bc
+RUN: llvm-modextract test.bc -n 0 -o test0.bc
+RUN: llvm-modextract test.bc -n 1 -o test1.bc
+
+;; Check that a CFI jumptable is generated.
+RUN: opt test1.bc -passes=lowertypetests -lowertypetests-read-summary=in.yaml \
+RUN:   -lowertypetests-summary-action=export -lowertypetests-write-summary=exported.yaml \
+RUN:   -S -o - | FileCheck %s --check-prefix=REGULAR
+REGULAR: @__typeid__ZTSFvvE_global_addr = hidden alias i8, ptr @.cfi.jumptable
+REGULAR: @f = alias void (), ptr @.cfi.jumptable
+REGULAR: define private void @.cfi.jumptable()
+
+;; CHECK that @llvm.type.test() is lowered to an actual check.
+RUN: opt test0.bc -passes=lowertypetests -lowertypetests-read-summary=exported.yaml \
+RUN:   -lowertypetests-summary-action=import -S -o - | FileCheck %s --check-prefix=THIN
+THIN:      define i1 @test() {
+THIN-NEXT:   %1 = icmp eq i64 ptrtoint (ptr @alias to i64), ptrtoint (ptr @__typeid__ZTSFvvE_global_addr to i64)
+THIN-NEXT:   ret i1 %1
+THIN-NEXT: }
+
+;--- test.ll
+target triple = "x86_64-pc-linux-gnu"
+
+@alias = alias void(), ptr @f
+
+define weak void @f() !type !0 {
+  ret void
+}
+
+define i1 @test() {
+  %1 = call i1 @llvm.type.test(ptr nonnull @alias, metadata !"_ZTSFvvE")
+  ret i1 %1
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+
+!0 = !{i64 0, !"_ZTSFvvE"}
+;--- in.yaml
+---
+GlobalValueMap:
+  8346051122425466633: # guid("test")
+    - Live: true
+      Refs: [5833419078793185394] # guid("alias")
+      TypeTests: [9080559750644022485] # guid("_ZTSFvvE")
+  5833419078793185394: # guid("alias")
+    - Aliasee: 14740650423002898831 # guid("f")
+  14740650423002898831: # guid("f")
+    -
+...
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt
index 24512cb0225e8..df689767b7724 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/CMakeLists.txt
@@ -13,4 +13,5 @@ add_llvm_unittest(SandboxVectorizerTests
   LegalityTest.cpp
   SchedulerTest.cpp
   SeedCollectorTest.cpp	
+  VecUtilsTest.cpp
 )
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
index 56c6bf5f1ef1f..aaa8e96de6d17 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
@@ -29,13 +29,19 @@ struct LegalityTest : public testing::Test {
 
 TEST_F(LegalityTest, Legality) {
   parseIR(C, R"IR(
-define void @foo(ptr %ptr) {
+define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float %farg0, float %farg1) {
   %gep0 = getelementptr float, ptr %ptr, i32 0
   %gep1 = getelementptr float, ptr %ptr, i32 1
+  %gep3 = getelementptr float, ptr %ptr, i32 3
   %ld0 = load float, ptr %gep0
   %ld1 = load float, ptr %gep0
   store float %ld0, ptr %gep0
   store float %ld1, ptr %gep1
+  store <2 x float> %vec2, ptr %gep1
+  store <3 x float> %vec3, ptr %gep3
+  store i8 %arg, ptr %gep1
+  %fadd0 = fadd float %farg0, %farg0
+  %fadd1 = fadd fast float %farg1, %farg1
   ret void
 }
 )IR");
@@ -46,10 +52,16 @@ define void @foo(ptr %ptr) {
   auto It = BB->begin();
   [[maybe_unused]] auto *Gep0 = cast<sandboxir::GetElementPtrInst>(&*It++);
   [[maybe_unused]] auto *Gep1 = cast<sandboxir::GetElementPtrInst>(&*It++);
+  [[maybe_unused]] auto *Gep3 = cast<sandboxir::GetElementPtrInst>(&*It++);
   [[maybe_unused]] auto *Ld0 = cast<sandboxir::LoadInst>(&*It++);
   [[maybe_unused]] auto *Ld1 = cast<sandboxir::LoadInst>(&*It++);
   auto *St0 = cast<sandboxir::StoreInst>(&*It++);
   auto *St1 = cast<sandboxir::StoreInst>(&*It++);
+  auto *StVec2 = cast<sandboxir::StoreInst>(&*It++);
+  auto *StVec3 = cast<sandboxir::StoreInst>(&*It++);
+  auto *StI8 = cast<sandboxir::StoreInst>(&*It++);
+  auto *FAdd0 = cast<sandboxir::BinaryOperator>(&*It++);
+  auto *FAdd1 = cast<sandboxir::BinaryOperator>(&*It++);
 
   sandboxir::LegalityAnalysis Legality;
   const auto &Result = Legality.canVectorize({St0, St1});
@@ -62,6 +74,30 @@ define void @foo(ptr %ptr) {
     EXPECT_EQ(cast<sandboxir::Pack>(Result).getReason(),
               sandboxir::ResultReason::NotInstructions);
   }
+  {
+    // Check DiffOpcodes
+    const auto &Result = Legality.canVectorize({St0, Ld0});
+    EXPECT_TRUE(isa<sandboxir::Pack>(Result));
+    EXPECT_EQ(cast<sandboxir::Pack>(Result).getReason(),
+              sandboxir::ResultReason::DiffOpcodes);
+  }
+  {
+    // Check DiffTypes
+    EXPECT_TRUE(isa<sandboxir::Widen>(Legality.canVectorize({St0, StVec2})));
+    EXPECT_TRUE(isa<sandboxir::Widen>(Legality.canVectorize({StVec2, StVec3})));
+
+    const auto &Result = Legality.canVectorize({St0, StI8});
+    EXPECT_TRUE(isa<sandboxir::Pack>(Result));
+    EXPECT_EQ(cast<sandboxir::Pack>(Result).getReason(),
+              sandboxir::ResultReason::DiffTypes);
+  }
+  {
+    // Check DiffMathFlags
+    const auto &Result = Legality.canVectorize({FAdd0, FAdd1});
+    EXPECT_TRUE(isa<sandboxir::Pack>(Result));
+    EXPECT_EQ(cast<sandboxir::Pack>(Result).getReason(),
+              sandboxir::ResultReason::DiffMathFlags);
+  }
 }
 
 #ifndef NDEBUG
@@ -85,5 +121,8 @@ TEST_F(LegalityTest, LegalityResultDump) {
   EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
                           sandboxir::ResultReason::DiffTypes),
                       "Pack Reason: DiffTypes"));
+  EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
+                          sandboxir::ResultReason::DiffMathFlags),
+                      "Pack Reason: DiffMathFlags"));
 }
 #endif // NDEBUG
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp
new file mode 100644
index 0000000000000..e0b0828496439
--- /dev/null
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp
@@ -0,0 +1,37 @@
+//===- VecUtilsTest.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/SandboxIR/Context.h"
+#include "llvm/SandboxIR/Type.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+struct VecUtilsTest : public testing::Test {
+  LLVMContext C;
+};
+
+TEST_F(VecUtilsTest, GetNumElements) {
+  sandboxir::Context Ctx(C);
+  auto *ElemTy = sandboxir::Type::getInt32Ty(Ctx);
+  EXPECT_EQ(sandboxir::VecUtils::getNumElements(ElemTy), 1);
+  auto *VTy = sandboxir::FixedVectorType::get(ElemTy, 2);
+  EXPECT_EQ(sandboxir::VecUtils::getNumElements(VTy), 2);
+  auto *VTy1 = sandboxir::FixedVectorType::get(ElemTy, 1);
+  EXPECT_EQ(sandboxir::VecUtils::getNumElements(VTy1), 1);
+}
+
+TEST_F(VecUtilsTest, GetElementType) {
+  sandboxir::Context Ctx(C);
+  auto *ElemTy = sandboxir::Type::getInt32Ty(Ctx);
+  EXPECT_EQ(sandboxir::VecUtils::getElementType(ElemTy), ElemTy);
+  auto *VTy = sandboxir::FixedVectorType::get(ElemTy, 2);
+  EXPECT_EQ(sandboxir::VecUtils::getElementType(VTy), ElemTy);
+}
diff --git a/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt b/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt
index cbe85789b29a3..7fc4af5403185 100644
--- a/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt
+++ b/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_conversion_library(MLIRLinalgToStandard
   MLIRIR
   MLIRLinalgDialect
   MLIRLinalgTransforms
+  MLIRLLVMDialect
   MLIRMemRefDialect
   MLIRPass
   MLIRSCFDialect
diff --git a/mlir/lib/Conversion/MathToLibm/CMakeLists.txt b/mlir/lib/Conversion/MathToLibm/CMakeLists.txt
index 61c46e9bfe250..0a4eb97474f3a 100644
--- a/mlir/lib/Conversion/MathToLibm/CMakeLists.txt
+++ b/mlir/lib/Conversion/MathToLibm/CMakeLists.txt
@@ -14,6 +14,7 @@ add_mlir_conversion_library(MLIRMathToLibm
   MLIRArithDialect
   MLIRDialectUtils
   MLIRFuncDialect
+  MLIRLLVMDialect
   MLIRMathDialect
   MLIRPass
   MLIRTransformUtils