diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 8ec4cf853fed2..882ae40086cea 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -380,6 +380,10 @@ class ToolChain {
   /// Check if the toolchain should use the integrated assembler.
   virtual bool useIntegratedAs() const;
 
+  /// Check if the toolchain should use AsmParser to parse inline asm when
+  /// the integrated assembler is not the default.
+  virtual bool parseInlineAsmUsingAsmParser() const { return false; }
+
   /// IsMathErrnoDefault - Does this tool chain use -fmath-errno by default.
   virtual bool IsMathErrnoDefault() const { return true; }
 
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index ca3fc5af76895..3000b8416adfd 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -176,6 +176,8 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 /// AIX - AIX tool chain which can call as(1) and ld(1) directly.
 AIX::AIX(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     : ToolChain(D, Triple, Args) {
+  ParseInlineAsmUsingAsmParser = Args.hasFlag(
+      options::OPT_fintegrated_as, options::OPT_fno_integrated_as, true);
   getLibraryPaths().push_back(getDriver().SysRoot + "/usr/lib");
 }
 
diff --git a/clang/lib/Driver/ToolChains/AIX.h b/clang/lib/Driver/ToolChains/AIX.h
index 1534af950c88f..d1ec6d10fb3a0 100644
--- a/clang/lib/Driver/ToolChains/AIX.h
+++ b/clang/lib/Driver/ToolChains/AIX.h
@@ -59,6 +59,9 @@ class LLVM_LIBRARY_VISIBILITY AIX : public ToolChain {
   AIX(const Driver &D, const llvm::Triple &Triple,
       const llvm::opt::ArgList &Args);
 
+  bool parseInlineAsmUsingAsmParser() const override {
+    return ParseInlineAsmUsingAsmParser;
+  }
   bool isPICDefault() const override { return true; }
   bool isPIEDefault() const override { return false; }
   bool isPICDefaultForced() const override { return true; }
@@ -87,6 +90,7 @@ class LLVM_LIBRARY_VISIBILITY AIX : public ToolChain {
 
 private:
   llvm::StringRef GetHeaderSysroot(const llvm::opt::ArgList &DriverArgs) const;
+  bool ParseInlineAsmUsingAsmParser;
 };
 
 } // end namespace toolchains
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 9c0922c8497cf..85204ceaa49a2 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5038,7 +5038,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
         << A->getValue() << A->getOption().getName();
   }
 
-  if (!TC.useIntegratedAs())
+  // If the toolchain chooses to use the MCAsmParser for inline asm, don't pass
+  // the option to disable the integrated assembler explicitly.
+  if (!TC.useIntegratedAs() && !TC.parseInlineAsmUsingAsmParser())
     CmdArgs.push_back("-no-integrated-as");
 
   if (Args.hasArg(options::OPT_fdebug_pass_structure)) {
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index bca0bb4ada672..676421552a757 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -598,7 +598,8 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
   }
   // C++2b features.
   if (LangOpts.CPlusPlus2b) {
-    Builder.defineMacro("__cpp_implicit_move", "202011L");
+    if (!LangOpts.MSVCCompat)
+      Builder.defineMacro("__cpp_implicit_move", "202011L");
     Builder.defineMacro("__cpp_size_t_suffix", "202011L");
   }
   if (LangOpts.Char8)
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 506c06b412b6f..59e64c4b1c5b1 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -3333,8 +3333,13 @@ Sema::NamedReturnInfo Sema::getNamedReturnInfo(Expr *&E, bool ForceCXX2b) {
   if (!VD)
     return NamedReturnInfo();
   NamedReturnInfo Res = getNamedReturnInfo(VD);
+  // FIXME: We suppress simpler implicit move here (unless ForceCXX2b is true)
+  // in MSVC compatibility mode just as a temporary workaround,
+  // as the MSVC STL has issues with this change.
+  // We will come back later with a more targeted approach.
   if (Res.Candidate && !E->isXValue() &&
-      (ForceCXX2b || getLangOpts().CPlusPlus2b)) {
+      (ForceCXX2b ||
+       (getLangOpts().CPlusPlus2b && !getLangOpts().MSVCCompat))) {
     E = ImplicitCastExpr::Create(Context, VD->getType().getNonReferenceType(),
                                  CK_NoOp, E, nullptr, VK_XValue,
                                  FPOptionsOverride());
diff --git a/clang/test/Driver/aix-as.c b/clang/test/Driver/aix-as.c
index aa8c610359037..def2adc97daaa 100644
--- a/clang/test/Driver/aix-as.c
+++ b/clang/test/Driver/aix-as.c
@@ -63,3 +63,18 @@
 // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}"
 // CHECK-AS32-MultiInput: "-a32"
 // CHECK-AS32-MultiInput: "-many"
+
+// Check that the -no-integrated-as flag is not passed by default.
+// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \
+// RUN: -target powerpc64-ibm-aix7.1.0.0 \
+// RUN: | FileCheck --check-prefix=CHECK-IAS --implicit-check-not=-no-integrated-as %s
+// CHECK-IAS: InstalledDir
+// CHECK-IAS: "-a64"
+
+// Check that the -no-integrated-as flag is passed when specified by the user.
+// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \
+// RUN: -target powerpc64-ibm-aix7.1.0.0 -fno-integrated-as \
+// RUN: | FileCheck --check-prefix=CHECK-NOIAS %s
+// CHECK-NOIAS: InstalledDir
+// CHECK-NOIAS: -no-integrated-as
+// CHECK-NOIAS: "-a64"
diff --git a/clang/test/SemaCXX/cxx2b-p2266-disable-with-msvc-compat.cpp b/clang/test/SemaCXX/cxx2b-p2266-disable-with-msvc-compat.cpp
new file mode 100644
index 0000000000000..2143c0535e606
--- /dev/null
+++ b/clang/test/SemaCXX/cxx2b-p2266-disable-with-msvc-compat.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -std=c++2b -fsyntax-only -fcxx-exceptions -verify=new %s
+// RUN: %clang_cc1 -std=c++2b -fsyntax-only -fcxx-exceptions -fms-compatibility -verify=old %s
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=old %s
+
+// FIXME: This is a test for a temporary workaround where we disable simpler implicit moves
+// when compiling with -fms-compatibility, because the MSVC STL does not compile with them.
+// A better workaround is under discussion.
+// The test cases here are just a copy from `CXX/class/class.init/class.copy.elision/p3.cpp`,
+// so feel free to delete this file when the workaround is not needed anymore.
+ +struct CopyOnly { + CopyOnly(); // new-note {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} + // new-note@-1 {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} + CopyOnly(CopyOnly &); // new-note {{candidate constructor not viable: expects an lvalue for 1st argument}} + // new-note@-1 {{candidate constructor not viable: expects an lvalue for 1st argument}} +}; +struct MoveOnly { + MoveOnly(); + MoveOnly(MoveOnly &&); +}; +MoveOnly &&rref(); + +MoveOnly &&test1(MoveOnly &&w) { + return w; // old-error {{cannot bind to lvalue of type}} +} + +CopyOnly test2(bool b) { + static CopyOnly w1; + CopyOnly w2; + if (b) { + return w1; + } else { + return w2; // new-error {{no matching constructor for initialization}} + } +} + +template T &&test3(T &&x) { return x; } // old-error {{cannot bind to lvalue of type}} +template MoveOnly &test3(MoveOnly &); +template MoveOnly &&test3(MoveOnly &&); // old-note {{in instantiation of function template specialization}} + +MoveOnly &&test4() { + MoveOnly &&x = rref(); + return x; // old-error {{cannot bind to lvalue of type}} +} + +void test5() try { + CopyOnly x; + throw x; // new-error {{no matching constructor for initialization}} +} catch (...) { +} diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index f5b07cee45c47..cdb33087ab53b 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -67,8 +67,12 @@ if (NOT COMPILER_RT_ASAN_SHADOW_SCALE STREQUAL "") -D${COMPILER_RT_ASAN_SHADOW_SCALE_DEFINITION}) endif() -set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS ON CACHE BOOL - "Enable libc interceptors in HWASan (testing mode)") +if(FUCHSIA) + set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS_DEFAULT OFF) +else() + set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS_DEFAULT ON) +endif() +set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS ${COMPILER_RT_HWASAN_WITH_INTERCEPTORS_DEFAULT} CACHE BOOL "Enable libc interceptors in HWASan (testing mode)") set(COMPILER_RT_BAREMETAL_BUILD OFF CACHE BOOL "Build for a bare-metal target.") diff --git a/compiler-rt/lib/hwasan/CMakeLists.txt b/compiler-rt/lib/hwasan/CMakeLists.txt index 137abf7c246ab..d65c9b843c1b5 100644 --- a/compiler-rt/lib/hwasan/CMakeLists.txt +++ b/compiler-rt/lib/hwasan/CMakeLists.txt @@ -7,6 +7,7 @@ set(HWASAN_RTL_SOURCES hwasan_allocation_functions.cpp hwasan_dynamic_shadow.cpp hwasan_exceptions.cpp + hwasan_fuchsia.cpp hwasan_globals.cpp hwasan_interceptors.cpp hwasan_interceptors_vfork.S @@ -44,6 +45,11 @@ set(HWASAN_RTL_HEADERS set(HWASAN_DEFINITIONS) append_list_if(COMPILER_RT_HWASAN_WITH_INTERCEPTORS HWASAN_WITH_INTERCEPTORS=1 HWASAN_DEFINITIONS) +if(FUCHSIA) + # Set this explicitly on Fuchsia, otherwise the default value is set to HWASAN_WITH_INTERCEPTORS. + list(APPEND HWASAN_DEFINITIONS HWASAN_REPLACE_OPERATORS_NEW_AND_DELETE=1) +endif() + set(HWASAN_RTL_CFLAGS ${SANITIZER_COMMON_CFLAGS}) append_rtti_flag(OFF HWASAN_RTL_CFLAGS) append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC HWASAN_RTL_CFLAGS) diff --git a/compiler-rt/lib/hwasan/hwasan_fuchsia.cpp b/compiler-rt/lib/hwasan/hwasan_fuchsia.cpp new file mode 100644 index 0000000000000..b8e67c4c48221 --- /dev/null +++ b/compiler-rt/lib/hwasan/hwasan_fuchsia.cpp @@ -0,0 +1,159 @@ +//===-- hwasan_fuchsia.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file is a part of HWAddressSanitizer and contains Fuchsia-specific +/// code. +/// +//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_fuchsia.h" +#if SANITIZER_FUCHSIA + +#include "hwasan.h" +#include "hwasan_interface_internal.h" +#include "hwasan_report.h" +#include "hwasan_thread.h" +#include "hwasan_thread_list.h" + +// This TLS variable contains the location of the stack ring buffer and can be +// used to always find the hwasan thread object associated with the current +// running thread. +[[gnu::tls_model("initial-exec")]] +SANITIZER_INTERFACE_ATTRIBUTE +THREADLOCAL uptr __hwasan_tls; + +namespace __hwasan { + +// These are known parameters passed to the hwasan runtime on thread creation. +struct Thread::InitState { + uptr stack_bottom, stack_top; +}; + +static void FinishThreadInitialization(Thread *thread); + +void InitThreads() { + // This is the minimal alignment needed for the storage where hwasan threads + // and their stack ring buffers are placed. This alignment is necessary so the + // stack ring buffer can perform a simple calculation to get the next element + // in the RB. The instructions for this calculation are emitted by the + // compiler. (Full explanation in hwasan_thread_list.h.) + uptr alloc_size = UINT64_C(1) << kShadowBaseAlignment; + uptr thread_start = reinterpret_cast( + MmapAlignedOrDieOnFatalError(alloc_size, alloc_size, __func__)); + + InitThreadList(thread_start, alloc_size); + + // Create the hwasan thread object for the current (main) thread. Stack info + // for this thread is known from information passed via + // __sanitizer_startup_hook. + const Thread::InitState state = { + .stack_bottom = __sanitizer::MainThreadStackBase, + .stack_top = + __sanitizer::MainThreadStackBase + __sanitizer::MainThreadStackSize, + }; + FinishThreadInitialization(hwasanThreadList().CreateCurrentThread(&state)); +} + +uptr *GetCurrentThreadLongPtr() { return &__hwasan_tls; } + +// This is called from the parent thread before the new thread is created. Here +// we can propagate known info like the stack bounds to Thread::Init before +// jumping into the thread. We cannot initialize the stack ring buffer yet since +// we have not entered the new thread. +static void *BeforeThreadCreateHook(uptr user_id, bool detached, + const char *name, uptr stack_bottom, + uptr stack_size) { + const Thread::InitState state = { + .stack_bottom = stack_bottom, + .stack_top = stack_bottom + stack_size, + }; + return hwasanThreadList().CreateCurrentThread(&state); +} + +// This sets the stack top and bottom according to the InitState passed to +// CreateCurrentThread above. +void Thread::InitStackAndTls(const InitState *state) { + CHECK_NE(state->stack_bottom, 0); + CHECK_NE(state->stack_top, 0); + stack_bottom_ = state->stack_bottom; + stack_top_ = state->stack_top; + tls_end_ = tls_begin_ = 0; +} + +// This is called after creating a new thread with the pointer returned by +// BeforeThreadCreateHook. We are still in the creating thread and should check +// if it was actually created correctly. +static void ThreadCreateHook(void *hook, bool aborted) { + Thread *thread = static_cast(hook); + if (!aborted) { + // The thread was created successfully. + // ThreadStartHook can already be running in the new thread. 
+ } else { + // The thread wasn't created after all. + // Clean up everything we set up in BeforeThreadCreateHook. + atomic_signal_fence(memory_order_seq_cst); + hwasanThreadList().ReleaseThread(thread); + } +} + +// This is called in the newly-created thread before it runs anything else, +// with the pointer returned by BeforeThreadCreateHook (above). Here we can +// setup the stack ring buffer. +static void ThreadStartHook(void *hook, thrd_t self) { + Thread *thread = static_cast(hook); + FinishThreadInitialization(thread); + thread->InitRandomState(); +} + +// This is the function that sets up the stack ring buffer and enables us to use +// GetCurrentThread. This function should only be called while IN the thread +// that we want to create the hwasan thread object for so __hwasan_tls can be +// properly referenced. +static void FinishThreadInitialization(Thread *thread) { + CHECK_NE(thread, nullptr); + + // The ring buffer is located immediately before the thread object. + uptr stack_buffer_size = hwasanThreadList().GetRingBufferSize(); + uptr stack_buffer_start = reinterpret_cast(thread) - stack_buffer_size; + thread->InitStackRingBuffer(stack_buffer_start, stack_buffer_size); +} + +static void ThreadExitHook(void *hook, thrd_t self) { + Thread *thread = static_cast(hook); + atomic_signal_fence(memory_order_seq_cst); + hwasanThreadList().ReleaseThread(thread); +} + +} // namespace __hwasan + +extern "C" { + +void *__sanitizer_before_thread_create_hook(thrd_t thread, bool detached, + const char *name, void *stack_base, + size_t stack_size) { + return __hwasan::BeforeThreadCreateHook( + reinterpret_cast(thread), detached, name, + reinterpret_cast(stack_base), stack_size); +} + +void __sanitizer_thread_create_hook(void *hook, thrd_t thread, int error) { + __hwasan::ThreadCreateHook(hook, error != thrd_success); +} + +void __sanitizer_thread_start_hook(void *hook, thrd_t self) { + __hwasan::ThreadStartHook(hook, reinterpret_cast(self)); +} + +void __sanitizer_thread_exit_hook(void *hook, thrd_t self) { + __hwasan::ThreadExitHook(hook, self); +} + +} // extern "C" + +#endif // SANITIZER_FUCHSIA diff --git a/compiler-rt/lib/hwasan/hwasan_thread.cpp b/compiler-rt/lib/hwasan/hwasan_thread.cpp index 764ca4f651b37..ee747a3beea5e 100644 --- a/compiler-rt/lib/hwasan/hwasan_thread.cpp +++ b/compiler-rt/lib/hwasan/hwasan_thread.cpp @@ -46,7 +46,12 @@ void Thread::Init(uptr stack_buffer_start, uptr stack_buffer_size, heap_allocations_ = HeapAllocationsRingBuffer::New(sz); InitStackAndTls(state); +#if !SANITIZER_FUCHSIA + // Do not initialize the stack ring buffer just yet on Fuchsia. Threads will + // be initialized before we enter the thread itself, so we will instead call + // this later. 
InitStackRingBuffer(stack_buffer_start, stack_buffer_size); +#endif } void Thread::InitStackRingBuffer(uptr stack_buffer_start, diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp index ea82d5392aeb6..03dc79d2d8379 100644 --- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp +++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.member/detach.pass.cpp @@ -60,7 +60,7 @@ class G int G::n_alive = 0; bool G::op_run = false; -void foo() {} +void foo() { done = true; } int main(int, char**) { @@ -75,6 +75,7 @@ int main(int, char**) assert(G::n_alive == 1); } assert(G::n_alive == 0); + done = false; #ifndef TEST_HAS_NO_EXCEPTIONS { std::thread t0 = support::make_test_thread(foo); @@ -85,6 +86,11 @@ int main(int, char**) t0.detach(); } catch (std::system_error const&) { } + // Wait to make sure that the detached thread has started up. + // Without this, we could exit main and start destructing global + // resources that are needed when the thread starts up, while the + // detached thread would start up only later. + while (!done) {} } #endif diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index f886f0e03929c..045ca85dcab31 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -528,7 +528,7 @@ static void compileBitcodeFiles() { // FIXME: Remove this once LTO.cpp honors config->exportDynamic. if (config->exportDynamic) for (InputFile *file : inputFiles) - if (auto *bitcodeFile = dyn_cast(file)) { + if (isa(file)) { warn("the effect of -export_dynamic on LTO is not yet implemented"); break; } diff --git a/lld/test/wasm/map-file.s b/lld/test/wasm/map-file.s index a215cc14d4cc0..9ca2f196f331e 100644 --- a/lld/test/wasm/map-file.s +++ b/lld/test/wasm/map-file.s @@ -10,6 +10,9 @@ wasm_global: bar: .functype bar () -> () i32.const somedata + i32.const somezeroes + drop + drop end_function write_global: @@ -30,9 +33,15 @@ somedata: .int32 123 .size somedata, 4 +.section .bss.somezeroes,"",@ +somezeroes: + .int32 0 +.size somezeroes, 4 + .section .debug_info,"",@ .int32 bar + # CHECK: Addr Off Size Out In Symbol # CHECK-NEXT: - 8 a TYPE # CHECK-NEXT: - 12 6 FUNCTION @@ -42,19 +51,22 @@ somedata: # CHECK-NEXT: 0 0 0 __stack_pointer # CHECK-NEXT: 1 0 0 wasm_global # CHECK-NEXT: - 33 15 EXPORT -# CHECK-NEXT: - 48 26 CODE -# CHECK-NEXT: - 49 9 {{.*}}{{/|\\}}map-file.s.tmp1.o:(bar) -# CHECK-NEXT: - 49 9 bar -# CHECK-NEXT: - 52 b {{.*}}{{/|\\}}map-file.s.tmp1.o:(write_global) -# CHECK-NEXT: - 52 b write_global -# CHECK-NEXT: - 5d f {{.*}}{{/|\\}}map-file.s.tmp1.o:(_start) -# CHECK-NEXT: - 5d f _start -# CHECK-NEXT: - 6e d DATA -# CHECK-NEXT: 400 6f 4 .data -# CHECK-NEXT: 400 75 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) -# CHECK-NEXT: 400 75 4 somedata -# CHECK-NEXT: - 7b 12 CUSTOM(.debug_info) -# CHECK-NEXT: - 8d 50 CUSTOM(name) +# CHECK-NEXT: - 48 2e CODE +# CHECK-NEXT: - 49 11 {{.*}}{{/|\\}}map-file.s.tmp1.o:(bar) +# CHECK-NEXT: - 49 11 bar +# CHECK-NEXT: - 5a b {{.*}}{{/|\\}}map-file.s.tmp1.o:(write_global) +# CHECK-NEXT: - 5a b write_global +# CHECK-NEXT: - 65 f {{.*}}{{/|\\}}map-file.s.tmp1.o:(_start) +# CHECK-NEXT: - 65 f _start +# CHECK-NEXT: - 76 d DATA +# CHECK-NEXT: 400 77 4 .data +# CHECK-NEXT: 400 7d 4 {{.*}}{{/|\\}}map-file.s.tmp1.o:(.data.somedata) +# CHECK-NEXT: 400 7d 4 somedata +# CHECK-NEXT: 404 76 4 .bss +# CHECK-NEXT: 404 0 4 
{{.*}}{{/|\\}}map-file.s.tmp1.o:(.bss.somezeroes) +# CHECK-NEXT: 404 0 4 somezeroes +# CHECK-NEXT: - 83 12 CUSTOM(.debug_info) +# CHECK-NEXT: - 95 50 CUSTOM(name) # RUN: not wasm-ld %t1.o -o /dev/null -Map=/ 2>&1 \ # RUN: | FileCheck -check-prefix=FAIL %s diff --git a/lld/wasm/MapFile.cpp b/lld/wasm/MapFile.cpp index c964efe1e742f..9dbab5046e23a 100644 --- a/lld/wasm/MapFile.cpp +++ b/lld/wasm/MapFile.cpp @@ -80,7 +80,9 @@ getSymbolStrings(ArrayRef syms) { auto *chunk = syms[i]->getChunk(); if (chunk == nullptr) return; - uint64_t fileOffset = chunk->outputSec->getOffset() + chunk->outSecOff; + uint64_t fileOffset = chunk->outputSec != nullptr + ? chunk->outputSec->getOffset() + chunk->outSecOff + : 0; uint64_t vma = -1; uint64_t size = 0; if (auto *DD = dyn_cast(syms[i])) { @@ -138,9 +140,11 @@ void lld::wasm::writeMapFile(ArrayRef outputSections) { oseg->size); os << oseg->name << '\n'; for (auto *chunk : oseg->inputSegments) { - writeHeader(os, chunk->getVA(), - chunk->outputSec->getOffset() + chunk->outSecOff, - chunk->getSize()); + uint64_t offset = + chunk->outputSec != nullptr + ? chunk->outputSec->getOffset() + chunk->outSecOff + : 0; + writeHeader(os, chunk->getVA(), offset, chunk->getSize()); os.indent(8) << toString(chunk) << '\n'; for (Symbol *sym : sectionSyms[chunk]) os << symStr[sym] << '\n'; diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index a26115aa82f11..f40b99968fd3a 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -329,7 +329,8 @@ class MemoryUse final : public MemoryUseOrDef { /*NumOperands=*/1) {} // allocate space for exactly one operand - void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { return MA->getValueID() == MemoryUseVal; @@ -389,7 +390,8 @@ class MemoryDef final : public MemoryUseOrDef { ID(Ver) {} // allocate space for exactly two operands - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { return MA->getValueID() == MemoryDefVal; @@ -484,9 +486,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUseOrDef, MemoryAccess) /// issue. class MemoryPhi final : public MemoryAccess { // allocate space for exactly zero operands - void *operator new(size_t s) { return User::operator new(s); } + void *operator new(size_t S) { return User::operator new(S); } public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess); diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index d0fe1a264b74e..76667eac051dd 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -270,11 +270,19 @@ namespace llvm { /// operations. FunctionPass *createGCLoweringPass(); + /// GCLowering Pass - Used by gc.root to perform its default lowering + /// operations. + extern char &GCLoweringID; + /// ShadowStackGCLowering - Implements the custom lowering mechanism /// used by the shadow stack GC. Only runs on functions which opt in to /// the shadow stack collector. 
FunctionPass *createShadowStackGCLoweringPass(); + /// ShadowStackGCLowering - Implements the custom lowering mechanism + /// used by the shadow stack GC. + extern char &ShadowStackGCLoweringID; + /// GCMachineCodeAnalysis - Target-independent pass to mark safe points /// in machine code. Must be added very late during code generation, just /// prior to output, and importantly after all CFG transformations (such as diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index e2474e21052f1..1cf6ac0e5f949 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -290,8 +290,7 @@ class TargetLoweringBase { bool IsSwiftError : 1; bool IsCFGuardTarget : 1; MaybeAlign Alignment = None; - Type *ByValType = nullptr; - Type *PreallocatedType = nullptr; + Type *IndirectType = nullptr; ArgListEntry() : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false), diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index f04bef161ea71..9eb2ce33cf817 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -17,6 +17,7 @@ #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h" #include #include @@ -31,21 +32,13 @@ bool objCRegistrationEnabled(); class MachOJITDylibInitializers { public: - struct SectionExtent { - SectionExtent() = default; - SectionExtent(JITTargetAddress Address, uint64_t NumPtrs) - : Address(Address), NumPtrs(NumPtrs) {} - JITTargetAddress Address = 0; - uint64_t NumPtrs = 0; - }; - - using RawPointerSectionList = std::vector; + using RawPointerSectionList = std::vector; void setObjCImageInfoAddr(JITTargetAddress ObjCImageInfoAddr) { this->ObjCImageInfoAddr = ObjCImageInfoAddr; } - void addModInitsSection(SectionExtent ModInit) { + void addModInitsSection(shared::ExecutorAddressRange ModInit) { ModInitSections.push_back(std::move(ModInit)); } @@ -53,7 +46,7 @@ class MachOJITDylibInitializers { return ModInitSections; } - void addObjCSelRefsSection(SectionExtent ObjCSelRefs) { + void addObjCSelRefsSection(shared::ExecutorAddressRange ObjCSelRefs) { ObjCSelRefsSections.push_back(std::move(ObjCSelRefs)); } @@ -61,7 +54,7 @@ class MachOJITDylibInitializers { return ObjCSelRefsSections; } - void addObjCClassListSection(SectionExtent ObjCClassList) { + void addObjCClassListSection(shared::ExecutorAddressRange ObjCClassList) { ObjCClassListSections.push_back(std::move(ObjCClassList)); } @@ -118,8 +111,8 @@ class MachOPlatform : public Platform { jitlink::LinkGraph &G, jitlink::PassConfiguration &Config) override; - LocalDependenciesMap getSyntheticSymbolLocalDependencies( - MaterializationResponsibility &MR) override; + SyntheticSymbolDependenciesMap + getSyntheticSymbolDependencies(MaterializationResponsibility &MR) override; // FIXME: We should be tentatively tracking scraped sections and discarding // if the MR fails. 
@@ -136,9 +129,9 @@ class MachOPlatform : public Platform { private: using InitSymbolDepMap = - DenseMap; + DenseMap; - void preserveInitSectionIfPresent(JITLinkSymbolVector &Syms, + void preserveInitSectionIfPresent(JITLinkSymbolSet &Symbols, jitlink::LinkGraph &G, StringRef SectionName); @@ -152,9 +145,9 @@ class MachOPlatform : public Platform { }; void registerInitInfo(JITDylib &JD, JITTargetAddress ObjCImageInfoAddr, - MachOJITDylibInitializers::SectionExtent ModInits, - MachOJITDylibInitializers::SectionExtent ObjCSelRefs, - MachOJITDylibInitializers::SectionExtent ObjCClassList); + shared::ExecutorAddressRange ModInits, + shared::ExecutorAddressRange ObjCSelRefs, + shared::ExecutorAddressRange ObjCClassList); ExecutionSession &ES; ObjectLinkingLayer &ObjLinkingLayer; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h index 55d0634a82ae5..3bb83342dcdbb 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h @@ -64,8 +64,9 @@ class ObjectLinkingLayer : public RTTIExtends, /// configured. class Plugin { public: - using JITLinkSymbolVector = std::vector; - using LocalDependenciesMap = DenseMap; + using JITLinkSymbolSet = DenseSet; + using SyntheticSymbolDependenciesMap = + DenseMap; virtual ~Plugin(); virtual void modifyPassConfig(MaterializationResponsibility &MR, @@ -89,12 +90,12 @@ class ObjectLinkingLayer : public RTTIExtends, ResourceKey SrcKey) = 0; /// Return any dependencies that synthetic symbols (e.g. init symbols) - /// have on locally scoped jitlink::Symbols. This is used by the - /// ObjectLinkingLayer to update the dependencies for the synthetic - /// symbols. - virtual LocalDependenciesMap - getSyntheticSymbolLocalDependencies(MaterializationResponsibility &MR) { - return LocalDependenciesMap(); + /// have on symbols in the LinkGraph. + /// This is used by the ObjectLinkingLayer to update the dependencies for + /// the synthetic symbols. + virtual SyntheticSymbolDependenciesMap + getSyntheticSymbolDependencies(MaterializationResponsibility &MR) { + return SyntheticSymbolDependenciesMap(); } }; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h new file mode 100644 index 0000000000000..efc4409b84f47 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/CommonOrcRuntimeTypes.h @@ -0,0 +1,68 @@ +//===------------------- CommonOrcRuntimeTypes.h ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic types usable with SPS and the ORC runtime. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_COMMONORCRUNTIMETYPES_H +#define LLVM_EXECUTIONENGINE_ORC_SHARED_COMMONORCRUNTIMETYPES_H + +#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" + +namespace llvm { +namespace orc { +namespace shared { + +// Placeholder for future replacement for JITTargetAddress. +using ExecutorAddress = uint64_t; + +/// Represents an address range in the exceutor process. 
+struct ExecutorAddressRange { + ExecutorAddressRange() = default; + ExecutorAddressRange(ExecutorAddress StartAddress, ExecutorAddress EndAddress) + : StartAddress(StartAddress), EndAddress(EndAddress) {} + + bool empty() const { return StartAddress == EndAddress; } + size_t size() const { return EndAddress - StartAddress; } + + ExecutorAddress StartAddress = 0; + ExecutorAddress EndAddress = 0; +}; + +using SPSExecutorAddressRange = + SPSTuple; + +/// Serialization traits for address ranges. +template <> +class SPSSerializationTraits { +public: + static size_t size(const ExecutorAddressRange &Value) { + return SPSArgList::size( + Value.StartAddress, Value.EndAddress); + } + + static bool serialize(SPSOutputBuffer &BOB, + const ExecutorAddressRange &Value) { + return SPSArgList::serialize( + BOB, Value.StartAddress, Value.EndAddress); + } + + static bool deserialize(SPSInputBuffer &BIB, ExecutorAddressRange &Value) { + return SPSArgList::deserialize( + BIB, Value.StartAddress, Value.EndAddress); + } +}; + +using SPSExecutorAddressRangeSequence = SPSSequence; + +} // End namespace shared. +} // End namespace orc. +} // End namespace llvm. + +#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_COMMONORCRUNTIMETYPES_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h index 379dd9efefd15..8dffea70e3355 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcError.h @@ -1,4 +1,4 @@ -//===------ OrcError.h - Reject symbol lookup requests ------*- C++ -*-===// +//===--------------- OrcError.h - Orc Error Types ---------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Define an error category, error codes, and helper utilities for Orc. +// Define an error category, error codes, and helper utilities for Orc. // //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 07d8e9ab5bb62..142dc21874508 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -58,9 +58,11 @@ class ConstantData : public Constant { protected: explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, nullptr, 0) {} - void *operator new(size_t s) { return User::operator new(s, 0); } + void *operator new(size_t S) { return User::operator new(S, 0); } public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + ConstantData(const ConstantData &) = delete; /// Methods to support type inquiry through isa, cast, and dyn_cast. @@ -849,12 +851,14 @@ class BlockAddress final : public Constant { BlockAddress(Function *F, BasicBlock *BB); - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Return a BlockAddress for the specified function and basic block. 
static BlockAddress *get(Function *F, BasicBlock *BB); @@ -893,12 +897,14 @@ class DSOLocalEquivalent final : public Constant { DSOLocalEquivalent(GlobalValue *GV); - void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t S) { return User::operator new(S, 1); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Return a DSOLocalEquivalent for the specified global value. static DSOLocalEquivalent *get(GlobalValue *GV); diff --git a/llvm/include/llvm/IR/GlobalIndirectSymbol.h b/llvm/include/llvm/IR/GlobalIndirectSymbol.h index d996237aa3efb..e45c7529885d5 100644 --- a/llvm/include/llvm/IR/GlobalIndirectSymbol.h +++ b/llvm/include/llvm/IR/GlobalIndirectSymbol.h @@ -35,9 +35,8 @@ class GlobalIndirectSymbol : public GlobalValue { GlobalIndirectSymbol &operator=(const GlobalIndirectSymbol &) = delete; // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index c690306cd3d27..2f31db0fa4d7e 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -68,9 +68,8 @@ class UnaryInstruction : public Instruction { public: // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -203,9 +202,8 @@ class BinaryOperator : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -769,9 +767,8 @@ class CmpInst : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Construct a compare instruction, given the opcode, the predicate and /// the two operands. Optionally (if InstBefore is specified) insert the @@ -1728,14 +1725,29 @@ class CallBase : public Instruction { /// Extract the byval type for a call or parameter. Type *getParamByValType(unsigned ArgNo) const { - Type *Ty = Attrs.getParamByValType(ArgNo); - return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType(); + if (auto *Ty = Attrs.getParamByValType(ArgNo)) + return Ty; + if (const Function *F = getCalledFunction()) + return F->getAttributes().getParamByValType(ArgNo); + return nullptr; } /// Extract the preallocated type for a call or parameter. Type *getParamPreallocatedType(unsigned ArgNo) const { - Type *Ty = Attrs.getParamPreallocatedType(ArgNo); - return Ty ? 
Ty : getArgOperand(ArgNo)->getType()->getPointerElementType(); + if (auto *Ty = Attrs.getParamPreallocatedType(ArgNo)) + return Ty; + if (const Function *F = getCalledFunction()) + return F->getAttributes().getParamPreallocatedType(ArgNo); + return nullptr; + } + + /// Extract the preallocated type for a call or parameter. + Type *getParamInAllocaType(unsigned ArgNo) const { + if (auto *Ty = Attrs.getParamInAllocaType(ArgNo)) + return Ty; + if (const Function *F = getCalledFunction()) + return F->getAttributes().getParamInAllocaType(ArgNo); + return nullptr; } /// Extract the number of dereferenceable bytes for a call or diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 5de72de77f839..e48a14f4b5b4b 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -333,9 +333,8 @@ class StoreInst : public Instruction { AtomicOrdering Order, SyncScope::ID SSID, BasicBlock *InsertAtEnd); // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Return true if this is a store to a volatile memory location. bool isVolatile() const { return getSubclassData(); } @@ -463,9 +462,8 @@ class FenceInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s, 0); - } + void *operator new(size_t S) { return User::operator new(S, 0); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Returns the ordering constraint of this fence instruction. AtomicOrdering getOrdering() const { @@ -547,9 +545,8 @@ class AtomicCmpXchgInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly three operands - void *operator new(size_t s) { - return User::operator new(s, 3); - } + void *operator new(size_t S) { return User::operator new(S, 3); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; using WeakField = BoolBitfieldElementT; @@ -792,9 +789,8 @@ class AtomicRMWInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; using AtomicOrderingField = @@ -2040,7 +2036,8 @@ class ShuffleVectorInst : public Instruction { ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, const Twine &NameStr, BasicBlock *InsertAtEnd); - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Swap the operands and adjust the mask to preserve the semantics /// of the instruction. 
@@ -2497,9 +2494,8 @@ class InsertValueInst : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } static InsertValueInst *Create(Value *Agg, Value *Val, ArrayRef Idxs, @@ -2875,9 +2871,7 @@ class LandingPadInst : public Instruction { const Twine &NameStr, BasicBlock *InsertAtEnd); // Allocate space for exactly zero operands. - void *operator new(size_t s) { - return User::operator new(s); - } + void *operator new(size_t S) { return User::operator new(S); } void growOperands(unsigned Size); void init(unsigned NumReservedValues, const Twine &NameStr); @@ -2889,6 +2883,8 @@ class LandingPadInst : public Instruction { LandingPadInst *cloneImpl() const; public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Constructors - NumReservedClauses is a hint for the number of incoming /// clauses that this landingpad will have (use 0 if you really have no idea). static LandingPadInst *Create(Type *RetTy, unsigned NumReservedClauses, @@ -3207,9 +3203,7 @@ class SwitchInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s); - } + void *operator new(size_t S) { return User::operator new(S); } void init(Value *Value, BasicBlock *Default, unsigned NumReserved); void growOperands(); @@ -3221,6 +3215,8 @@ class SwitchInst : public Instruction { SwitchInst *cloneImpl() const; public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + // -2 static const unsigned DefaultPseudoIndex = static_cast(~0L-1); @@ -3605,9 +3601,7 @@ class IndirectBrInst : public Instruction { IndirectBrInst(Value *Address, unsigned NumDests, BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s); - } + void *operator new(size_t S) { return User::operator new(S); } void init(Value *Address, unsigned NumDests); void growOperands(); @@ -3619,6 +3613,8 @@ class IndirectBrInst : public Instruction { IndirectBrInst *cloneImpl() const; public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Iterator type that casts an operand to a basic block. 
/// /// This only makes sense because the successors are stored as adjacent @@ -4256,7 +4252,7 @@ class CatchSwitchInst : public Instruction { BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { return User::operator new(s); } + void *operator new(size_t S) { return User::operator new(S); } void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved); void growOperands(unsigned Size); @@ -4268,6 +4264,8 @@ class CatchSwitchInst : public Instruction { CatchSwitchInst *cloneImpl() const; public: + void operator delete(void *Ptr) { return User::operator delete(Ptr); } + static CatchSwitchInst *Create(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumHandlers, const Twine &NameStr = "", @@ -4696,9 +4694,8 @@ class UnreachableInst : public Instruction { explicit UnreachableInst(LLVMContext &C, BasicBlock *InsertAtEnd); // allocate space for exactly zero operands - void *operator new(size_t s) { - return User::operator new(s, 0); - } + void *operator new(size_t S) { return User::operator new(S, 0); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } unsigned getNumSuccessors() const { return 0; } diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index d2400d0371e3c..708325298aaeb 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -99,6 +99,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); initializeSafeStackLegacyPassPass(Registry); + initializeShadowStackGCLoweringPass(Registry); initializeShrinkWrapPass(Registry); initializeSjLjEHPreparePass(Registry); initializeSlotIndexesPass(Registry); diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp index faf0fb7f09a7a..58269e172c573 100644 --- a/llvm/lib/CodeGen/GCRootLowering.cpp +++ b/llvm/lib/CodeGen/GCRootLowering.cpp @@ -85,6 +85,7 @@ INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false) FunctionPass *llvm::createGCLoweringPass() { return new LowerIntrinsics(); } char LowerIntrinsics::ID = 0; +char &llvm::GCLoweringID = LowerIntrinsics::ID; LowerIntrinsics::LowerIntrinsics() : FunctionPass(ID) { initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry()); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index ffdaf9a547e66..ec40ddc1ff750 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1076,15 +1076,12 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { } MaybeAlign MemAlign = Arg.Alignment; if (Arg.IsByVal || Arg.IsInAlloca || Arg.IsPreallocated) { - PointerType *Ty = cast(Arg.Ty); - Type *ElementTy = Ty->getElementType(); - unsigned FrameSize = - DL.getTypeAllocSize(Arg.ByValType ? Arg.ByValType : ElementTy); + unsigned FrameSize = DL.getTypeAllocSize(Arg.IndirectType); // For ByVal, alignment should come from FE. BE will guess if this info // is not there, but there are cases it cannot get right. 
if (!MemAlign) - MemAlign = Align(TLI.getByValTypeAlignment(ElementTy, DL)); + MemAlign = Align(TLI.getByValTypeAlignment(Arg.IndirectType, DL)); Flags.setByValSize(FrameSize); } else if (!MemAlign) { MemAlign = DL.getABITypeAlign(Arg.Ty); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 941ec61264b4c..baef5e7c4a770 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9578,18 +9578,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } Align MemAlign; if (Args[i].IsByVal || Args[i].IsInAlloca || Args[i].IsPreallocated) { - PointerType *Ty = cast(Args[i].Ty); - Type *ElementTy = Ty->getElementType(); - - unsigned FrameSize = DL.getTypeAllocSize( - Args[i].ByValType ? Args[i].ByValType : ElementTy); + unsigned FrameSize = DL.getTypeAllocSize(Args[i].IndirectType); Flags.setByValSize(FrameSize); // info is not there but there are cases it cannot get right. if (auto MA = Args[i].Alignment) MemAlign = *MA; else - MemAlign = Align(getByValTypeAlignment(ElementTy, DL)); + MemAlign = Align(getByValTypeAlignment(Args[i].IndirectType, DL)); } else if (auto MA = Args[i].Alignment) { MemAlign = *MA; } else { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 00403a9260b01..bc033b06e7a54 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -119,15 +119,18 @@ void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call, IsSwiftAsync = Call->paramHasAttr(ArgIdx, Attribute::SwiftAsync); IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError); Alignment = Call->getParamStackAlign(ArgIdx); - ByValType = nullptr; + IndirectType = nullptr; + assert(IsByVal + IsPreallocated + IsInAlloca <= 1 && + "multiple ABI attributes?"); if (IsByVal) { - ByValType = Call->getParamByValType(ArgIdx); + IndirectType = Call->getParamByValType(ArgIdx); if (!Alignment) Alignment = Call->getParamAlign(ArgIdx); } - PreallocatedType = nullptr; if (IsPreallocated) - PreallocatedType = Call->getParamPreallocatedType(ArgIdx); + IndirectType = Call->getParamPreallocatedType(ArgIdx); + if (IsInAlloca) + IndirectType = Call->getParamInAllocaType(ArgIdx); } /// Generate a libcall taking the given operands as arguments and returning a diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index 36752ef86526d..86b559fd64130 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -89,6 +89,7 @@ class ShadowStackGCLowering : public FunctionPass { } // end anonymous namespace char ShadowStackGCLowering::ID = 0; +char &llvm::ShadowStackGCLoweringID = ShadowStackGCLowering::ID; INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 2d8ecd5025fa1..2a4f6bfd98b03 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -857,8 +857,8 @@ void TargetPassConfig::addIRPasses() { // Run GC lowering passes for builtin collectors // TODO: add a pass insertion point here - addPass(createGCLoweringPass()); - addPass(createShadowStackGCLoweringPass()); + addPass(&GCLoweringID); + addPass(&ShadowStackGCLoweringID); 
addPass(createLowerConstantIntrinsicsPass()); // Make sure that no unreachable blocks are instruction selected. diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 80df097a07410..39557a485cf28 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -88,11 +88,15 @@ bool objCRegistrationEnabled() { void MachOJITDylibInitializers::runModInits() const { for (const auto &ModInit : ModInitSections) { - for (uint64_t I = 0; I != ModInit.NumPtrs; ++I) { - auto *InitializerAddr = jitTargetAddressToPointer( - ModInit.Address + (I * sizeof(uintptr_t))); - auto *Initializer = - jitTargetAddressToFunction(*InitializerAddr); + assert(ModInit.size() % sizeof(uintptr_t) == 0 && + "ModInit section size is not a pointer multiple?"); + for (uintptr_t * + InitPtr = + jitTargetAddressToPointer(ModInit.StartAddress), + *InitEnd = + jitTargetAddressToPointer(ModInit.EndAddress); + InitPtr != InitEnd; ++InitPtr) { + auto *Initializer = reinterpret_cast(*InitPtr); Initializer(); } } @@ -102,8 +106,11 @@ void MachOJITDylibInitializers::registerObjCSelectors() const { assert(objCRegistrationEnabled() && "ObjC registration not enabled."); for (const auto &ObjCSelRefs : ObjCSelRefsSections) { - for (uint64_t I = 0; I != ObjCSelRefs.NumPtrs; ++I) { - auto SelEntryAddr = ObjCSelRefs.Address + (I * sizeof(uintptr_t)); + assert(ObjCSelRefs.size() % sizeof(uintptr_t) == 0 && + "ObjCSelRefs section size is not a pointer multiple?"); + for (JITTargetAddress SelEntryAddr = ObjCSelRefs.StartAddress; + SelEntryAddr != ObjCSelRefs.EndAddress; + SelEntryAddr += sizeof(uintptr_t)) { const auto *SelName = *jitTargetAddressToPointer(SelEntryAddr); auto Sel = sel_registerName(SelName); @@ -128,8 +135,11 @@ Error MachOJITDylibInitializers::registerObjCClasses() const { auto ClassSelector = sel_registerName("class"); for (const auto &ObjCClassList : ObjCClassListSections) { - for (uint64_t I = 0; I != ObjCClassList.NumPtrs; ++I) { - auto ClassPtrAddr = ObjCClassList.Address + (I * sizeof(uintptr_t)); + assert(ObjCClassList.size() % sizeof(uintptr_t) == 0 && + "ObjCClassList section size is not a pointer multiple?"); + for (JITTargetAddress ClassPtrAddr = ObjCClassList.StartAddress; + ClassPtrAddr != ObjCClassList.EndAddress; + ClassPtrAddr += sizeof(uintptr_t)) { auto Cls = *jitTargetAddressToPointer(ClassPtrAddr); auto *ClassCompiled = *jitTargetAddressToPointer(ClassPtrAddr); @@ -264,37 +274,36 @@ MachOPlatform::getDeinitializerSequence(JITDylib &JD) { void MachOPlatform::registerInitInfo( JITDylib &JD, JITTargetAddress ObjCImageInfoAddr, - MachOJITDylibInitializers::SectionExtent ModInits, - MachOJITDylibInitializers::SectionExtent ObjCSelRefs, - MachOJITDylibInitializers::SectionExtent ObjCClassList) { + shared::ExecutorAddressRange ModInits, + shared::ExecutorAddressRange ObjCSelRefs, + shared::ExecutorAddressRange ObjCClassList) { std::lock_guard Lock(InitSeqsMutex); auto &InitSeq = InitSeqs[&JD]; InitSeq.setObjCImageInfoAddr(ObjCImageInfoAddr); - if (ModInits.Address) + if (ModInits.StartAddress) InitSeq.addModInitsSection(std::move(ModInits)); - if (ObjCSelRefs.Address) + if (ObjCSelRefs.StartAddress) InitSeq.addObjCSelRefsSection(std::move(ObjCSelRefs)); - if (ObjCClassList.Address) + if (ObjCClassList.StartAddress) InitSeq.addObjCClassListSection(std::move(ObjCClassList)); } -static Expected +static Expected getSectionExtent(jitlink::LinkGraph &G, StringRef SectionName) { auto *Sec = 
G.findSectionByName(SectionName); if (!Sec) - return MachOJITDylibInitializers::SectionExtent(); + return shared::ExecutorAddressRange(); jitlink::SectionRange R(*Sec); if (R.getSize() % G.getPointerSize() != 0) return make_error(SectionName + " section size is not a " "multiple of the pointer size", inconvertibleErrorCode()); - return MachOJITDylibInitializers::SectionExtent( - R.getStart(), R.getSize() / G.getPointerSize()); + return shared::ExecutorAddressRange{R.getStart(), R.getEnd()}; } void MachOPlatform::InitScraperPlugin::modifyPassConfig( @@ -305,17 +314,14 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( return; Config.PrePrunePasses.push_back([this, &MR](jitlink::LinkGraph &G) -> Error { - JITLinkSymbolVector InitSectionSymbols; - preserveInitSectionIfPresent(InitSectionSymbols, G, - "__DATA,__mod_init_func"); - preserveInitSectionIfPresent(InitSectionSymbols, G, - "__DATA,__objc_selrefs"); - preserveInitSectionIfPresent(InitSectionSymbols, G, - "__DATA,__objc_classlist"); - - if (!InitSectionSymbols.empty()) { + JITLinkSymbolSet InitSectionSyms; + preserveInitSectionIfPresent(InitSectionSyms, G, "__DATA,__mod_init_func"); + preserveInitSectionIfPresent(InitSectionSyms, G, "__DATA,__objc_selrefs"); + preserveInitSectionIfPresent(InitSectionSyms, G, "__DATA,__objc_classlist"); + + if (!InitSectionSyms.empty()) { std::lock_guard Lock(InitScraperMutex); - InitSymbolDeps[&MR] = std::move(InitSectionSymbols); + InitSymbolDeps[&MR] = std::move(InitSectionSyms); } if (auto Err = processObjCImageInfo(G, MR)) @@ -326,8 +332,7 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( Config.PostFixupPasses.push_back([this, &JD = MR.getTargetJITDylib()]( jitlink::LinkGraph &G) -> Error { - MachOJITDylibInitializers::SectionExtent ModInits, ObjCSelRefs, - ObjCClassList; + shared::ExecutorAddressRange ModInits, ObjCSelRefs, ObjCClassList; JITTargetAddress ObjCImageInfoAddr = 0; if (auto *ObjCImageInfoSec = @@ -359,23 +364,26 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( LLVM_DEBUG({ dbgs() << "MachOPlatform: Scraped " << G.getName() << " init sections:\n"; dbgs() << " __objc_selrefs: "; - if (ObjCSelRefs.NumPtrs) - dbgs() << ObjCSelRefs.NumPtrs << " pointer(s) at " - << formatv("{0:x16}", ObjCSelRefs.Address) << "\n"; + auto NumObjCSelRefs = ObjCSelRefs.size() / sizeof(uintptr_t); + if (NumObjCSelRefs) + dbgs() << NumObjCSelRefs << " pointer(s) at " + << formatv("{0:x16}", ObjCSelRefs.StartAddress) << "\n"; else dbgs() << "none\n"; dbgs() << " __objc_classlist: "; - if (ObjCClassList.NumPtrs) - dbgs() << ObjCClassList.NumPtrs << " pointer(s) at " - << formatv("{0:x16}", ObjCClassList.Address) << "\n"; + auto NumObjCClasses = ObjCClassList.size() / sizeof(uintptr_t); + if (NumObjCClasses) + dbgs() << NumObjCClasses << " pointer(s) at " + << formatv("{0:x16}", ObjCClassList.StartAddress) << "\n"; else dbgs() << "none\n"; dbgs() << " __mod_init_func: "; - if (ModInits.NumPtrs) - dbgs() << ModInits.NumPtrs << " pointer(s) at " - << formatv("{0:x16}", ModInits.Address) << "\n"; + auto NumModInits = ModInits.size() / sizeof(uintptr_t); + if (NumModInits) + dbgs() << NumModInits << " pointer(s) at " + << formatv("{0:x16}", ModInits.StartAddress) << "\n"; else dbgs() << "none\n"; }); @@ -387,27 +395,26 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( }); } -ObjectLinkingLayer::Plugin::LocalDependenciesMap -MachOPlatform::InitScraperPlugin::getSyntheticSymbolLocalDependencies( +ObjectLinkingLayer::Plugin::SyntheticSymbolDependenciesMap 
+MachOPlatform::InitScraperPlugin::getSyntheticSymbolDependencies( MaterializationResponsibility &MR) { std::lock_guard Lock(InitScraperMutex); auto I = InitSymbolDeps.find(&MR); if (I != InitSymbolDeps.end()) { - LocalDependenciesMap Result; + SyntheticSymbolDependenciesMap Result; Result[MR.getInitializerSymbol()] = std::move(I->second); InitSymbolDeps.erase(&MR); return Result; } - return LocalDependenciesMap(); + return SyntheticSymbolDependenciesMap(); } void MachOPlatform::InitScraperPlugin::preserveInitSectionIfPresent( - JITLinkSymbolVector &Symbols, jitlink::LinkGraph &G, - StringRef SectionName) { + JITLinkSymbolSet &Symbols, jitlink::LinkGraph &G, StringRef SectionName) { if (auto *Sec = G.findSectionByName(SectionName)) { auto SecBlocks = Sec->blocks(); if (!llvm::empty(SecBlocks)) - Symbols.push_back( + Symbols.insert( &G.addAnonymousSymbol(**SecBlocks.begin(), 0, 0, false, true)); } } diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index c10aa15ef2697..a45b18544609d 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -331,12 +331,82 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } private: - struct LocalSymbolNamedDependencies { + // Symbol name dependencies: + // Internal: Defined in this graph. + // External: Defined externally. + struct BlockSymbolDependencies { SymbolNameSet Internal, External; }; - using LocalSymbolNamedDependenciesMap = - DenseMap; + // Lazily populated map of blocks to BlockSymbolDependencies values. + class BlockDependenciesMap { + public: + BlockDependenciesMap(ExecutionSession &ES, + DenseMap> BlockDeps) + : ES(ES), BlockDeps(std::move(BlockDeps)) {} + + const BlockSymbolDependencies &operator[](const Block &B) { + // Check the cache first. + auto I = BlockTransitiveDepsCache.find(&B); + if (I != BlockTransitiveDepsCache.end()) + return I->second; + + // No value. Populate the cache. + BlockSymbolDependencies BTDCacheVal; + auto BDI = BlockDeps.find(&B); + assert(BDI != BlockDeps.end() && "No block dependencies"); + + for (auto *BDep : BDI->second) { + auto &BID = getBlockImmediateDeps(*BDep); + for (auto &ExternalDep : BID.External) + BTDCacheVal.External.insert(ExternalDep); + for (auto &InternalDep : BID.Internal) + BTDCacheVal.Internal.insert(InternalDep); + } + + return BlockTransitiveDepsCache + .insert(std::make_pair(&B, std::move(BTDCacheVal))) + .first->second; + } + + SymbolStringPtr &getInternedName(Symbol &Sym) { + auto I = NameCache.find(&Sym); + if (I != NameCache.end()) + return I->second; + + return NameCache.insert(std::make_pair(&Sym, ES.intern(Sym.getName()))) + .first->second; + } + + private: + BlockSymbolDependencies &getBlockImmediateDeps(Block &B) { + // Check the cache first. 
+ auto I = BlockImmediateDepsCache.find(&B); + if (I != BlockImmediateDepsCache.end()) + return I->second; + + BlockSymbolDependencies BIDCacheVal; + for (auto &E : B.edges()) { + auto &Tgt = E.getTarget(); + if (Tgt.getScope() != Scope::Local) { + if (Tgt.isExternal()) + BIDCacheVal.External.insert(getInternedName(Tgt)); + else + BIDCacheVal.Internal.insert(getInternedName(Tgt)); + } + } + + return BlockImmediateDepsCache + .insert(std::make_pair(&B, std::move(BIDCacheVal))) + .first->second; + } + + ExecutionSession &ES; + DenseMap> BlockDeps; + DenseMap NameCache; + DenseMap BlockImmediateDepsCache; + DenseMap BlockTransitiveDepsCache; + }; Error claimOrExternalizeWeakAndCommonSymbols(LinkGraph &G) { auto &ES = Layer.getExecutionSession(); @@ -384,7 +454,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { Error computeNamedSymbolDependencies(LinkGraph &G) { auto &ES = MR->getTargetJITDylib().getExecutionSession(); - auto LocalDeps = computeLocalDeps(G); + auto BlockDeps = computeBlockNonLocalDeps(G); // Compute dependencies for symbols defined in the JITLink graph. for (auto *Sym : G.defined_symbols()) { @@ -395,58 +465,41 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { assert(Sym->hasName() && "Defined non-local jitlink::Symbol should have a name"); - SymbolNameSet ExternalSymDeps, InternalSymDeps; - - // Find internal and external named symbol dependencies. - for (auto &E : Sym->getBlock().edges()) { - auto &TargetSym = E.getTarget(); - - if (TargetSym.getScope() != Scope::Local) { - if (TargetSym.isExternal()) - ExternalSymDeps.insert(ES.intern(TargetSym.getName())); - else if (&TargetSym != Sym) - InternalSymDeps.insert(ES.intern(TargetSym.getName())); - } else { - assert(TargetSym.isDefined() && - "local symbols must be defined"); - auto I = LocalDeps.find(&TargetSym); - if (I != LocalDeps.end()) { - for (auto &S : I->second.External) - ExternalSymDeps.insert(S); - for (auto &S : I->second.Internal) - InternalSymDeps.insert(S); - } - } - } - - if (ExternalSymDeps.empty() && InternalSymDeps.empty()) + auto &SymDeps = BlockDeps[Sym->getBlock()]; + if (SymDeps.External.empty() && SymDeps.Internal.empty()) continue; auto SymName = ES.intern(Sym->getName()); - if (!ExternalSymDeps.empty()) - ExternalNamedSymbolDeps[SymName] = std::move(ExternalSymDeps); - if (!InternalSymDeps.empty()) - InternalNamedSymbolDeps[SymName] = std::move(InternalSymDeps); + if (!SymDeps.External.empty()) + ExternalNamedSymbolDeps[SymName] = SymDeps.External; + if (!SymDeps.Internal.empty()) + InternalNamedSymbolDeps[SymName] = SymDeps.Internal; } for (auto &P : Layer.Plugins) { - auto SyntheticLocalDeps = P->getSyntheticSymbolLocalDependencies(*MR); - if (SyntheticLocalDeps.empty()) + auto SynthDeps = P->getSyntheticSymbolDependencies(*MR); + if (SynthDeps.empty()) continue; - for (auto &KV : SyntheticLocalDeps) { + DenseSet BlockVisited; + for (auto &KV : SynthDeps) { auto &Name = KV.first; - auto &LocalDepsForName = KV.second; - for (auto *Local : LocalDepsForName) { - assert(Local->getScope() == Scope::Local && - "Dependence on non-local symbol"); - auto LocalNamedDepsItr = LocalDeps.find(Local); - if (LocalNamedDepsItr == LocalDeps.end()) - continue; - for (auto &S : LocalNamedDepsItr->second.Internal) - InternalNamedSymbolDeps[Name].insert(S); - for (auto &S : LocalNamedDepsItr->second.External) - ExternalNamedSymbolDeps[Name].insert(S); + auto &DepsForName = KV.second; + for (auto *Sym : DepsForName) { + if (Sym->getScope() == Scope::Local) { + auto 
&BDeps = BlockDeps[Sym->getBlock()]; + for (auto &S : BDeps.Internal) + InternalNamedSymbolDeps[Name].insert(S); + for (auto &S : BDeps.External) + ExternalNamedSymbolDeps[Name].insert(S); + } else { + if (Sym->isExternal()) + ExternalNamedSymbolDeps[Name].insert( + BlockDeps.getInternedName(*Sym)); + else + InternalNamedSymbolDeps[Name].insert( + BlockDeps.getInternedName(*Sym)); + } } } } @@ -454,81 +507,69 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { return Error::success(); } - LocalSymbolNamedDependenciesMap computeLocalDeps(LinkGraph &G) { - DenseMap> DepMap; - - // For all local symbols: - // (1) Add their named dependencies. - // (2) Add them to the worklist for further iteration if they have any - // depend on any other local symbols. - struct WorklistEntry { - WorklistEntry(Symbol *Sym, DenseSet LocalDeps) - : Sym(Sym), LocalDeps(std::move(LocalDeps)) {} - - Symbol *Sym = nullptr; - DenseSet LocalDeps; + BlockDependenciesMap computeBlockNonLocalDeps(LinkGraph &G) { + // First calculate the reachable-via-non-local-symbol blocks for each block. + struct BlockInfo { + DenseSet Dependencies; + DenseSet Dependants; + bool DependenciesChanged = true; }; - std::vector Worklist; - for (auto *Sym : G.defined_symbols()) - if (Sym->getScope() == Scope::Local) { - auto &SymNamedDeps = DepMap[Sym]; - DenseSet LocalDeps; - - for (auto &E : Sym->getBlock().edges()) { - auto &TargetSym = E.getTarget(); - if (TargetSym.getScope() != Scope::Local) - SymNamedDeps.insert(&TargetSym); - else { - assert(TargetSym.isDefined() && - "local symbols must be defined"); - LocalDeps.insert(&TargetSym); + DenseMap BlockInfos; + SmallVector WorkList; + + // Pre-allocate map entries. This prevents any iterator/reference + // invalidation in the next loop. + for (auto *B : G.blocks()) + (void)BlockInfos[B]; + + // Build initial worklist, record block dependencies/dependants and + // non-local symbol dependencies. + for (auto *B : G.blocks()) { + auto &BI = BlockInfos[B]; + for (auto &E : B->edges()) { + if (E.getTarget().getScope() == Scope::Local) { + auto &TgtB = E.getTarget().getBlock(); + if (&TgtB != B) { + BI.Dependencies.insert(&TgtB); + BlockInfos[&TgtB].Dependants.insert(B); } } - - if (!LocalDeps.empty()) - Worklist.push_back(WorklistEntry(Sym, std::move(LocalDeps))); } - // Loop over all local symbols with local dependencies, propagating - // their respective non-local dependencies. Iterate until we hit a stable - // state. - bool Changed; - do { - Changed = false; - for (auto &WLEntry : Worklist) { - auto *Sym = WLEntry.Sym; - auto &NamedDeps = DepMap[Sym]; - auto &LocalDeps = WLEntry.LocalDeps; - - for (auto *TargetSym : LocalDeps) { - auto I = DepMap.find(TargetSym); - if (I != DepMap.end()) - for (const auto &S : I->second) - Changed |= NamedDeps.insert(S).second; - } - } - } while (Changed); + // If this node has both dependants and dependencies then add it to the + // worklist to propagate the dependencies to the dependants. + if (!BI.Dependants.empty() && !BI.Dependencies.empty()) + WorkList.push_back(B); + } - // Intern the results to produce a mapping of jitlink::Symbol* to internal - // and external symbol names. 
- auto &ES = Layer.getExecutionSession(); - LocalSymbolNamedDependenciesMap Result; - for (auto &KV : DepMap) { - auto *Local = KV.first; - assert(Local->getScope() == Scope::Local && - "DepMap keys should all be local symbols"); - auto &LocalNamedDeps = Result[Local]; - for (auto *Named : KV.second) { - assert(Named->getScope() != Scope::Local && - "DepMap values should all be non-local symbol sets"); - if (Named->isExternal()) - LocalNamedDeps.External.insert(ES.intern(Named->getName())); - else - LocalNamedDeps.Internal.insert(ES.intern(Named->getName())); + // Propagate block-level dependencies through the block-dependence graph. + while (!WorkList.empty()) { + auto *B = WorkList.back(); + WorkList.pop_back(); + + auto &BI = BlockInfos[B]; + assert(BI.DependenciesChanged && + "Block in worklist has unchanged dependencies"); + BI.DependenciesChanged = false; + for (auto *Dependant : BI.Dependants) { + auto &DependantBI = BlockInfos[Dependant]; + for (auto *Dependency : BI.Dependencies) { + if (Dependant != Dependency && + DependantBI.Dependencies.insert(Dependency).second) + if (!DependantBI.DependenciesChanged) { + DependantBI.DependenciesChanged = true; + WorkList.push_back(Dependant); + } + } } } - return Result; + DenseMap> BlockDeps; + for (auto &KV : BlockInfos) + BlockDeps[KV.first] = std::move(KV.second.Dependencies); + + return BlockDependenciesMap(Layer.getExecutionSession(), + std::move(BlockDeps)); } void registerDependencies(const SymbolDependenceMap &QueryDeps) { diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index 7fc25a8944e6b..4056c57480816 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -51,9 +51,8 @@ class UnaryConstantExpr final : public ConstantExpr { } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -79,9 +78,8 @@ class BinaryConstantExpr final : public ConstantExpr { } // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -106,9 +104,8 @@ class SelectConstantExpr final : public ConstantExpr { } // allocate space for exactly three operands - void *operator new(size_t s) { - return User::operator new(s, 3); - } + void *operator new(size_t S) { return User::operator new(S, 3); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -134,9 +131,8 @@ class ExtractElementConstantExpr final : public ConstantExpr { } // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. 
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -163,9 +159,8 @@ class InsertElementConstantExpr final : public ConstantExpr { } // allocate space for exactly three operands - void *operator new(size_t s) { - return User::operator new(s, 3); - } + void *operator new(size_t S) { return User::operator new(S, 3); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -200,7 +195,8 @@ class ShuffleVectorConstantExpr final : public ConstantExpr { SmallVector ShuffleMask; Constant *ShuffleMaskForBitcode; - void *operator new(size_t s) { return User::operator new(s, 2); } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -226,9 +222,8 @@ class ExtractValueConstantExpr final : public ConstantExpr { } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t S) { return User::operator new(S, 1); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Indices - These identify which value to extract. const SmallVector Indices; @@ -258,9 +253,8 @@ class InsertValueConstantExpr final : public ConstantExpr { } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Indices - These identify the position for the insertion. const SmallVector Indices; @@ -323,9 +317,8 @@ class CompareConstantExpr final : public ConstantExpr { } // allocate space for exactly two operands - void *operator new(size_t s) { - return User::operator new(s, 2); - } + void *operator new(size_t S) { return User::operator new(S, 2); } + void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. 
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e4fb9b7ae9679..5caab2e85486e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14401,29 +14401,34 @@ static SDValue performIntrinsicCombine(SDNode *N, N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); break; + case Intrinsic::aarch64_sve_fcmpge: case Intrinsic::aarch64_sve_cmpge: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETGE)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGE)); break; + case Intrinsic::aarch64_sve_fcmpgt: case Intrinsic::aarch64_sve_cmpgt: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETGT)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETGT)); break; + case Intrinsic::aarch64_sve_fcmpeq: case Intrinsic::aarch64_sve_cmpeq: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); break; + case Intrinsic::aarch64_sve_fcmpne: case Intrinsic::aarch64_sve_cmpne: - if (!N->getOperand(2).getValueType().isFloatingPoint()) - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), - N->getValueType(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), DAG.getCondCode(ISD::SETNE)); + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETNE)); + break; + case Intrinsic::aarch64_sve_fcmpuo: + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), + N->getValueType(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), DAG.getCondCode(ISD::SETUO)); break; case Intrinsic::aarch64_sve_fadda: return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 4465ee7ce7620..0b483be3176c9 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1255,20 +1255,20 @@ let Predicates = [HasSVE] in { defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>; defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; - defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, SETOGE, SETGE, SETOLE, SETLE>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, SETOGT, SETGT, SETOLT, SETLT>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, SETOEQ, SETEQ, SETOEQ, SETEQ>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", 
int_aarch64_sve_fcmpne, SETONE, SETNE, SETONE, SETNE>; - defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, SETUO, SETUO, SETUO, SETUO>; + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETONE, SETNE, SETONE, SETNE>; + defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", SETUO, SETUO, SETUO, SETUO>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; - defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">; - defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">; - defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">; - defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">; - defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">; - defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">; + defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; + defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; + defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt", SETOLT, SETLT, SETOGT, SETGT>; + defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle", SETOLE, SETLE, SETOGE, SETGE>; + defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; + defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETONE, SETNE, SETONE, SETNE>; defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index f60f6cacf2c3b..1e44a267c8b0b 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4394,6 +4394,14 @@ multiclass SVE_SETCC_Pat; } +multiclass SVE_SETCC_Pat_With_Zero { + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, (SVEDup0), cc)), + (cmp $Op1, $Op2)>; + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)), + (cmp $Op1, $Op2)>; +} + multiclass sve_int_cmp_0 opc, string asm, CondCode cc, CondCode invcc> { def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>; def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>; @@ -4754,10 +4762,13 @@ multiclass sve_fp_3op_p_pd opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_fp_3op_p_pd_cc opc, string asm, SDPatternOperator op, +multiclass sve_fp_3op_p_pd_cc opc, string asm, CondCode cc1, CondCode cc2, - CondCode invcc1, CondCode invcc2> -: sve_fp_3op_p_pd { + CondCode invcc1, CondCode invcc2> { + def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>; + def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>; + def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>; + defm : SVE_SETCC_Pat(NAME # _H)>; defm : SVE_SETCC_Pat(NAME # _H)>; defm : SVE_SETCC_Pat(NAME # _H)>; @@ -4797,10 +4808,26 @@ class sve_fp_2op_p_pd sz, bits<3> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; } -multiclass sve_fp_2op_p_pd opc, string asm> { +multiclass sve_fp_2op_p_pd opc, string asm, + CondCode cc1, CondCode cc2, + CondCode invcc1, CondCode invcc2> { def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>; def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>; def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, 
ZPR64>; + + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _D)>; + + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _D)>; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f611c893cde4c..7f74204229c20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -763,6 +763,9 @@ class AMDGPUPassConfig : public TargetPassConfig { // anything. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); + // Garbage collection is not supported. + disablePass(&GCLoweringID); + disablePass(&ShadowStackGCLoweringID); } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 7beef25794b16..acfee63d203ab 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -365,7 +365,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion( unsigned BBNum = MBB->getNumber(); if (OldVarInfo.AliveBlocks.test(BBNum)) { NewVarInfo.AliveBlocks.set(BBNum); - LLVM_DEBUG(dbgs() << "Removing ALiveBlock " << printMBBReference(*MBB) + LLVM_DEBUG(dbgs() << "Removing AliveBlock " << printMBBReference(*MBB) << '\n'); OldVarInfo.AliveBlocks.reset(BBNum); } diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 0792b303b8309..b24c061af7ab7 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -137,7 +137,6 @@ def MIReadVGPR : SchedReadVariant<[ // The latency values are 1 / (operations / cycle) / 4. 
multiclass SICommonWriteRes { - let RetireOOO = 1 in { // llvm-mca specific flag def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; // Can be between 2 and 64 @@ -160,7 +159,6 @@ multiclass SICommonWriteRes { def : HWWriteRes; let ResourceCycles = [16] in def : HWWriteRes; - } // End RetireOOO = 1 def : ReadAdvance; @@ -184,7 +182,6 @@ let SchedModel = SIFullSpeedModel in { defm : SICommonWriteRes; -let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -192,7 +189,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; @@ -202,7 +198,6 @@ let SchedModel = SIQuarterSpeedModel in { defm : SICommonWriteRes; -let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -210,7 +205,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; @@ -224,7 +218,6 @@ let SchedModel = SIDPFullSpeedModel in { defm : SICommonWriteRes; -let RetireOOO = 1 in { // llvm-mca specific flag def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; @@ -232,7 +225,6 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; @@ -248,7 +240,6 @@ let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). // Add 1 stall cycle for VGPR read. -let RetireOOO = 1 in { // llvm-mca specific flag def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; @@ -268,7 +259,6 @@ def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; -} // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 99b8cd5d20d30..39cf24b00ac3d 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14524,18 +14524,15 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements() : SToVRHS.getValueType().getVectorNumElements(); int NumEltsOut = ShuffV.size(); - unsigned InElemSizeInBits = - SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() - : SToVRHS.getValueType().getScalarSizeInBits(); - unsigned OutElemSizeInBits = SToVLHS - ? LHS.getValueType().getScalarSizeInBits() - : RHS.getValueType().getScalarSizeInBits(); - // The width of the "valid lane" (i.e. the lane that contains the value that // is vectorized) needs to be expressed in terms of the number of elements // of the shuffle. It is thereby the ratio of the values before and after // any bitcast. - unsigned ValidLaneWidth = InElemSizeInBits / OutElemSizeInBits; + unsigned ValidLaneWidth = + SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() / + LHS.getValueType().getScalarSizeInBits() + : SToVRHS.getValueType().getScalarSizeInBits() / + RHS.getValueType().getScalarSizeInBits(); // Initially assume that neither input is permuted. These will be adjusted // accordingly if either input is. @@ -14548,9 +14545,10 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, // ISD::SCALAR_TO_VECTOR.
// On big endian systems, this only makes sense for element sizes smaller // than 64 bits since for 64-bit elements, all instructions already put - // the value into element zero. + // the value into element zero. Since scalar size of LHS and RHS may differ + // after isScalarToVec, this should be checked using their own sizes. if (SToVLHS) { - if (!IsLittleEndian && InElemSizeInBits >= 64) + if (!IsLittleEndian && SToVLHS.getValueType().getScalarSizeInBits() >= 64) return Res; // Set up the values for the shuffle vector fixup. LHSMaxIdx = NumEltsOut / NumEltsIn; @@ -14560,7 +14558,7 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, LHS = SToVLHS; } if (SToVRHS) { - if (!IsLittleEndian && InElemSizeInBits >= 64) + if (!IsLittleEndian && SToVRHS.getValueType().getScalarSizeInBits() >= 64) return Res; RHSMinIdx = NumEltsOut; RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 18250cf8ef850..7a0c524d63b0d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -93,6 +93,14 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (setcc I64:$src, 0, SETEQ))], "i64.eqz \t$dst, $src", "i64.eqz", 0x50>; +// Optimize away an explicit mask on a shift count. +def : Pat<(shl I32:$lhs, (and I32:$rhs, 31)), (SHL_I32 I32:$lhs, I32:$rhs)>; +def : Pat<(sra I32:$lhs, (and I32:$rhs, 31)), (SHR_S_I32 I32:$lhs, I32:$rhs)>; +def : Pat<(srl I32:$lhs, (and I32:$rhs, 31)), (SHR_U_I32 I32:$lhs, I32:$rhs)>; +def : Pat<(shl I64:$lhs, (and I64:$rhs, 63)), (SHL_I64 I64:$lhs, I64:$rhs)>; +def : Pat<(sra I64:$lhs, (and I64:$rhs, 63)), (SHR_S_I64 I64:$lhs, I64:$rhs)>; +def : Pat<(srl I64:$lhs, (and I64:$rhs, 63)), (SHR_U_I64 I64:$lhs, I64:$rhs)>; + // Optimize away an explicit mask on a rotate count. def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 2c35b4944fc47..d7058ff049362 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -898,6 +898,35 @@ defm SHL : SIMDShiftInt; defm SHR_S : SIMDShiftInt; defm SHR_U : SIMDShiftInt; +// Optimize away an explicit mask on a shift count. 
+def : Pat<(wasm_shl (v16i8 V128:$lhs), (and I32:$rhs, 7)), + (SHL_I8x16 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_s (v16i8 V128:$lhs), (and I32:$rhs, 7)), + (SHR_S_I8x16 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_u (v16i8 V128:$lhs), (and I32:$rhs, 7)), + (SHR_U_I8x16 V128:$lhs, I32:$rhs)>; + +def : Pat<(wasm_shl (v8i16 V128:$lhs), (and I32:$rhs, 15)), + (SHL_I16x8 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_s (v8i16 V128:$lhs), (and I32:$rhs, 15)), + (SHR_S_I16x8 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_u (v8i16 V128:$lhs), (and I32:$rhs, 15)), + (SHR_U_I16x8 V128:$lhs, I32:$rhs)>; + +def : Pat<(wasm_shl (v4i32 V128:$lhs), (and I32:$rhs, 31)), + (SHL_I32x4 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_s (v4i32 V128:$lhs), (and I32:$rhs, 31)), + (SHR_S_I32x4 V128:$lhs, I32:$rhs)>; +def : Pat<(wasm_shr_u (v4i32 V128:$lhs), (and I32:$rhs, 31)), + (SHR_U_I32x4 V128:$lhs, I32:$rhs)>; + +def : Pat<(wasm_shl (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))), + (SHL_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>; +def : Pat<(wasm_shr_s (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))), + (SHR_S_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>; +def : Pat<(wasm_shr_u (v2i64 V128:$lhs), (trunc (and I64:$rhs, 63))), + (SHR_U_I64x2 V128:$lhs, (I32_WRAP_I64 I64:$rhs))>; + //===----------------------------------------------------------------------===// // Integer binary arithmetic //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index f2e109762ffd3..3c752ab5fa25d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -308,3 +308,117 @@ define @ne_fast( %x, %y = fcmp fast one %x, %x2 ret %y } +define @oeq_zero( %x) { +; CHECK-LABEL: oeq_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp oeq %x, zeroinitializer + ret %y +} +define @ogt_zero( %x) { +; CHECK-LABEL: ogt_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp ogt %x, zeroinitializer + ret %y +} +define @oge_zero( %x) { +; CHECK-LABEL: oge_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp oge %x, zeroinitializer + ret %y +} +define @olt_zero( %x) { +; CHECK-LABEL: olt_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmlt p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp olt %x, zeroinitializer + ret %y +} +define @ole_zero( %x) { +; CHECK-LABEL: ole_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmle p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp ole %x, zeroinitializer + ret %y +} +define @one_zero( %x) { +; CHECK-LABEL: one_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ret + %y = fcmp one %x, zeroinitializer + ret %y +} +define @ueq_zero( %x) { +; CHECK-LABEL: ueq_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ueq %x, zeroinitializer + ret %y +} +define @ugt_zero( %x) { +; CHECK-LABEL: ugt_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmle p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ugt %x, zeroinitializer + ret %y +} +define @uge_zero( %x) { +; CHECK-LABEL: uge_zero: +; 
CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmlt p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp uge %x, zeroinitializer + ret %y +} +define @ult_zero( %x) { +; CHECK-LABEL: ult_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ult %x, zeroinitializer + ret %y +} +define @ule_zero( %x) { +; CHECK-LABEL: ule_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp ule %x, zeroinitializer + ret %y +} +define @une_zero( %x) { +; CHECK-LABEL: une_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: not p0.b, p0/z, p1.b +; CHECK-NEXT: ret + %y = fcmp une %x, zeroinitializer + ret %y +} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 5ae9f9ecbb419..d8b08461b6ee1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -617,8 +617,7 @@ define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]].h +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_1024-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -638,8 +637,7 @@ define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]].h +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -702,8 +700,7 @@ define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]].s +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_512-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -722,8 +719,7 @@ define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; 
VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]].s +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_1024-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -742,8 +738,7 @@ define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]].s +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -795,8 +790,7 @@ define void @masked_gather_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 { ; CHECK: ptrue [[PG0:p[0-9]+]].d, vl4 ; CHECK-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; CHECK-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; CHECK-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; CHECK-NEXT: ret @@ -813,8 +807,7 @@ define void @masked_gather_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_512-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_512-NEXT: ret @@ -831,8 +824,7 @@ define void @masked_gather_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { ; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_1024-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_1024-NEXT: ret @@ -849,8 +841,7 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_2048-NEXT: ld1d { 
[[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret @@ -871,8 +862,7 @@ define void @masked_gather_32b_scaled_sext(<32 x half>* %a, <32 x i32>* %b, half ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #1] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -893,8 +883,7 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw #1] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -915,8 +904,7 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -938,8 +926,7 @@ define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -961,8 +948,7 @@ define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* % ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #2] ; VBITS_GE_2048-NEXT: 
uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -982,8 +968,7 @@ define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %b ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1006,9 +991,8 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %o ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2 -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1031,9 +1015,8 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4 -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1054,9 +1037,8 @@ define void @masked_gather_passthru(<32 x float>* %a, <32 x float*>* %b, <32 x f ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] ; VBITS_GE_2048-NEXT: ld1w { [[PT:z[0-9]+]].s }, [[PG0]]/z, [x2] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: sel [[SEL:z[0-9]+]].s, [[PG1]], [[UZP]].s, [[PT]].s @@ -1077,8 +1059,7 @@ define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, 
[[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index aa79ea7992b70..5dc40e399d0ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -562,8 +562,7 @@ define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s ; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] @@ -581,8 +580,7 @@ define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s ; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] @@ -639,8 +637,7 @@ define void @masked_scatter_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 { ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret @@ -657,8 +654,7 @@ define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret @@ -675,8 +671,7 @@ define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: 
mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret @@ -723,8 +718,7 @@ define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 { ; CHECK: ptrue [[PG0:p[0-9]+]].d, vl4 ; CHECK-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d +; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; CHECK-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <4 x double>, <4 x double>* %a @@ -739,8 +733,7 @@ define void @masked_scatter_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 { ; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]] +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_512-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x double>, <8 x double>* %a @@ -755,8 +748,7 @@ define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 { ; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]] +; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_1024-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x double>, <16 x double>* %a @@ -771,8 +763,7 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].d, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 ; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x double>, <32 x double>* %a @@ -791,8 +782,7 @@ define void @masked_scatter_32b_scaled_sext(<32 x half>* %a, <32 x i32>* %b, hal ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1] ; VBITS_GE_2048-NEXT: ret @@ -811,8 +801,7 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal ; 
VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1] ; VBITS_GE_2048-NEXT: ret @@ -831,8 +820,7 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw] ; VBITS_GE_2048-NEXT: ret @@ -852,8 +840,7 @@ define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw] ; VBITS_GE_2048-NEXT: ret @@ -873,8 +860,7 @@ define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2] ; VBITS_GE_2048-NEXT: ret @@ -892,8 +878,7 @@ define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* % ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d] ; VBITS_GE_2048-NEXT: ret @@ -914,8 +899,7 @@ define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 % ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2 -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq 
[[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d] @@ -937,8 +921,7 @@ define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4 -; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]] +; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s ; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d] diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll index 3200f14680bf3..ceda1c2b05121 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-compares.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s ; @@ -6,8 +7,9 @@ define @facge_h( %pg, %a, %b) { ; CHECK-LABEL: facge_h: -; CHECK: facge p0.h, p0/z, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: facge p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.facge.nxv8f16( %pg, %a, %b) @@ -16,8 +18,9 @@ define @facge_h( %pg, %a, define @facge_s( %pg, %a, %b) { ; CHECK-LABEL: facge_s: -; CHECK: facge p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: facge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.facge.nxv4f32( %pg, %a, %b) @@ -26,8 +29,9 @@ define @facge_s( %pg, %a define @facge_d( %pg, %a, %b) { ; CHECK-LABEL: facge_d: -; CHECK: facge p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: facge p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.facge.nxv2f64( %pg, %a, %b) @@ -40,8 +44,9 @@ define @facge_d( %pg, % define @facgt_h( %pg, %a, %b) { ; CHECK-LABEL: facgt_h: -; CHECK: facgt p0.h, p0/z, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: facgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.facgt.nxv8f16( %pg, %a, %b) @@ -50,8 +55,9 @@ define @facgt_h( %pg, %a, define @facgt_s( %pg, %a, %b) { ; CHECK-LABEL: facgt_s: -; CHECK: facgt p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: facgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.facgt.nxv4f32( %pg, %a, %b) @@ -60,8 +66,9 @@ define @facgt_s( %pg, %a define @facgt_d( %pg, %a, %b) { ; CHECK-LABEL: facgt_d: -; CHECK: facgt p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: facgt p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.facgt.nxv2f64( %pg, %a, %b) @@ -74,8 +81,9 @@ define @facgt_d( %pg, % define @fcmeq_h( %pg, %a, %b) { ; CHECK-LABEL: fcmeq_h: -; CHECK: fcmeq p0.h, p0/z, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ret %out = call 
@llvm.aarch64.sve.fcmpeq.nxv8f16( %pg, %a, %b) @@ -84,8 +92,9 @@ define @fcmeq_h( %pg, %a, define @fcmeq_s( %pg, %a, %b) { ; CHECK-LABEL: fcmeq_s: -; CHECK: fcmeq p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpeq.nxv4f32( %pg, %a, %b) @@ -94,22 +103,35 @@ define @fcmeq_s( %pg, %a define @fcmeq_d( %pg, %a, %b) { ; CHECK-LABEL: fcmeq_d: -; CHECK: fcmeq p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpeq.nxv2f64( %pg, %a, %b) ret %out } +define @fcmeq_zero( %pg, %a) { +; CHECK-LABEL: fcmeq_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcmpeq.nxv2f64( %pg, + %a, + zeroinitializer) + ret %out +} + ; ; FCMGE ; define @fcmge_h( %pg, %a, %b) { ; CHECK-LABEL: fcmge_h: -; CHECK: fcmge p0.h, p0/z, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmge p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpge.nxv8f16( %pg, %a, %b) @@ -118,8 +140,9 @@ define @fcmge_h( %pg, %a, define @fcmge_s( %pg, %a, %b) { ; CHECK-LABEL: fcmge_s: -; CHECK: fcmge p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpge.nxv4f32( %pg, %a, %b) @@ -128,22 +151,34 @@ define @fcmge_s( %pg, %a define @fcmge_d( %pg, %a, %b) { ; CHECK-LABEL: fcmge_d: -; CHECK: fcmge p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmge p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpge.nxv2f64( %pg, %a, %b) ret %out } +define @fcmge_zero( %pg, %a) { +; CHECK-LABEL: fcmge_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: fcmge p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcmpge.nxv2f64( %pg, + %a, + zeroinitializer) + ret %out +} ; ; FCMGT ; define @fcmgt_h( %pg, %a, %b) { ; CHECK-LABEL: fcmgt_h: -; CHECK: fcmgt p0.h, p0/z, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmgt p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpgt.nxv8f16( %pg, %a, %b) @@ -152,8 +187,9 @@ define @fcmgt_h( %pg, %a, define @fcmgt_s( %pg, %a, %b) { ; CHECK-LABEL: fcmgt_s: -; CHECK: fcmgt p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpgt.nxv4f32( %pg, %a, %b) @@ -162,22 +198,34 @@ define @fcmgt_s( %pg, %a define @fcmgt_d( %pg, %a, %b) { ; CHECK-LABEL: fcmgt_d: -; CHECK: fcmgt p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpgt.nxv2f64( %pg, %a, %b) ret %out } +define @fcmgt_zero( %pg, %a) { +; CHECK-LABEL: fcmgt_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcmpgt.nxv2f64( %pg, + %a, + zeroinitializer) + ret %out +} ; ; FCMNE ; define @fcmne_h( %pg, %a, %b) { ; CHECK-LABEL: fcmne_h: -; CHECK: fcmne p0.h, p0/z, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpne.nxv8f16( %pg, %a, %b) @@ -186,8 +234,9 @@ define @fcmne_h( %pg, %a, define @fcmne_s( %pg, %a, %b) { ; CHECK-LABEL: fcmne_s: -; CHECK: fcmne p0.s, p0/z, z0.s, z1.s -; 
CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpne.nxv4f32( %pg, %a, %b) @@ -196,22 +245,35 @@ define @fcmne_s( %pg, %a define @fcmne_d( %pg, %a, %b) { ; CHECK-LABEL: fcmne_d: -; CHECK: fcmne p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpne.nxv2f64( %pg, %a, %b) ret %out } +define @fcmne_zero( %pg, %a) { +; CHECK-LABEL: fcmne_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcmpne.nxv2f64( %pg, + %a, + zeroinitializer) + ret %out +} + ; ; FCMPUO ; define @fcmuo_h( %pg, %a, %b) { ; CHECK-LABEL: fcmuo_h: -; CHECK: fcmuo p0.h, p0/z, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpuo.nxv8f16( %pg, %a, %b) @@ -220,8 +282,9 @@ define @fcmuo_h( %pg, %a, define @fcmuo_s( %pg, %a, %b) { ; CHECK-LABEL: fcmuo_s: -; CHECK: fcmuo p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpuo.nxv4f32( %pg, %a, %b) @@ -230,8 +293,9 @@ define @fcmuo_s( %pg, %a define @fcmuo_d( %pg, %a, %b) { ; CHECK-LABEL: fcmuo_d: -; CHECK: fcmuo p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.fcmpuo.nxv2f64( %pg, %a, %b) diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 4d42307327658..80d05799281a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -22,8 +22,8 @@ ; GCN-O0-NEXT: Target Transform Information ; GCN-O0-NEXT: Assumption Cache Tracker ; GCN-O0-NEXT: Profile summary info -; GCN-O0-NEXT: Create Garbage Collector Module Metadata ; GCN-O0-NEXT: Argument Register Usage Information Storage +; GCN-O0-NEXT: Create Garbage Collector Module Metadata ; GCN-O0-NEXT: Register Usage Information Storage ; GCN-O0-NEXT: Machine Branch Probability Analysis ; GCN-O0-NEXT: ModulePass Manager @@ -43,9 +43,7 @@ ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: Expand Atomic instructions -; GCN-O0-NEXT: Lower Garbage Collection Instructions -; GCN-O0-NEXT: Shadow Stack GC Lowering +; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Lower constant intrinsics ; GCN-O0-NEXT: Remove unreachable blocks from the CFG ; GCN-O0-NEXT: Expand vector predication intrinsics @@ -165,8 +163,8 @@ ; GCN-O1-NEXT: External Alias Analysis ; GCN-O1-NEXT: Type-Based Alias Analysis ; GCN-O1-NEXT: Scoped NoAlias Alias Analysis -; GCN-O1-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-NEXT: Argument Register Usage Information Storage +; GCN-O1-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-NEXT: Machine Branch Probability Analysis ; GCN-O1-NEXT: Register Usage Information Storage ; GCN-O1-NEXT: ModulePass Manager @@ -209,8 +207,6 @@ ; GCN-O1-NEXT: Lazy Branch Probability Analysis ; GCN-O1-NEXT: Lazy Block Frequency Analysis ; GCN-O1-NEXT: Expand memcmp() to load/stores -; GCN-O1-NEXT: Lower Garbage Collection Instructions -; GCN-O1-NEXT: Shadow Stack GC Lowering ; GCN-O1-NEXT: Lower constant intrinsics ; GCN-O1-NEXT: Remove unreachable blocks from the 
CFG ; GCN-O1-NEXT: Natural Loop Information @@ -413,8 +409,8 @@ ; GCN-O1-OPTS-NEXT: External Alias Analysis ; GCN-O1-OPTS-NEXT: Type-Based Alias Analysis ; GCN-O1-OPTS-NEXT: Scoped NoAlias Alias Analysis -; GCN-O1-OPTS-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-OPTS-NEXT: Argument Register Usage Information Storage +; GCN-O1-OPTS-NEXT: Create Garbage Collector Module Metadata ; GCN-O1-OPTS-NEXT: Machine Branch Probability Analysis ; GCN-O1-OPTS-NEXT: Register Usage Information Storage ; GCN-O1-OPTS-NEXT: ModulePass Manager @@ -475,8 +471,6 @@ ; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis ; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Expand memcmp() to load/stores -; GCN-O1-OPTS-NEXT: Lower Garbage Collection Instructions -; GCN-O1-OPTS-NEXT: Shadow Stack GC Lowering ; GCN-O1-OPTS-NEXT: Lower constant intrinsics ; GCN-O1-OPTS-NEXT: Remove unreachable blocks from the CFG ; GCN-O1-OPTS-NEXT: Natural Loop Information @@ -694,8 +688,8 @@ ; GCN-O2-NEXT: External Alias Analysis ; GCN-O2-NEXT: Type-Based Alias Analysis ; GCN-O2-NEXT: Scoped NoAlias Alias Analysis -; GCN-O2-NEXT: Create Garbage Collector Module Metadata ; GCN-O2-NEXT: Argument Register Usage Information Storage +; GCN-O2-NEXT: Create Garbage Collector Module Metadata ; GCN-O2-NEXT: Machine Branch Probability Analysis ; GCN-O2-NEXT: Register Usage Information Storage ; GCN-O2-NEXT: ModulePass Manager @@ -756,8 +750,6 @@ ; GCN-O2-NEXT: Lazy Branch Probability Analysis ; GCN-O2-NEXT: Lazy Block Frequency Analysis ; GCN-O2-NEXT: Expand memcmp() to load/stores -; GCN-O2-NEXT: Lower Garbage Collection Instructions -; GCN-O2-NEXT: Shadow Stack GC Lowering ; GCN-O2-NEXT: Lower constant intrinsics ; GCN-O2-NEXT: Remove unreachable blocks from the CFG ; GCN-O2-NEXT: Natural Loop Information @@ -976,8 +968,8 @@ ; GCN-O3-NEXT: External Alias Analysis ; GCN-O3-NEXT: Type-Based Alias Analysis ; GCN-O3-NEXT: Scoped NoAlias Alias Analysis -; GCN-O3-NEXT: Create Garbage Collector Module Metadata ; GCN-O3-NEXT: Argument Register Usage Information Storage +; GCN-O3-NEXT: Create Garbage Collector Module Metadata ; GCN-O3-NEXT: Machine Branch Probability Analysis ; GCN-O3-NEXT: Register Usage Information Storage ; GCN-O3-NEXT: ModulePass Manager @@ -1043,8 +1035,6 @@ ; GCN-O3-NEXT: Lazy Branch Probability Analysis ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Expand memcmp() to load/stores -; GCN-O3-NEXT: Lower Garbage Collection Instructions -; GCN-O3-NEXT: Shadow Stack GC Lowering ; GCN-O3-NEXT: Lower constant intrinsics ; GCN-O3-NEXT: Remove unreachable blocks from the CFG ; GCN-O3-NEXT: Natural Loop Information diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll index c011e45412045..3ec8468dcd364 100644 --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -5,6 +5,10 @@ ; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ ; RUN: -check-prefix=P8 +; RUN: llc -mcpu=pwr7 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \ +; RUN: -check-prefix=P7 + define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readonly %a) local_unnamed_addr { ; P9-LABEL: test: ; P9: # %bb.0: # %entry @@ -19,6 +23,13 @@ define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readon ; P8-NEXT: lxvdsx vs0, 0, r4 ; P8-NEXT: stxvd2x vs0, 0, r3 ; 
P8-NEXT: blr +; +; P7-LABEL: test: +; P7: # %bb.0: # %entry +; P7-NEXT: addi r4, r4, 24 +; P7-NEXT: lxvdsx vs0, 0, r4 +; P7-NEXT: stxvd2x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds double, double* %a, i64 3 %0 = load double, double* %arrayidx, align 8 @@ -43,6 +54,16 @@ define dso_local void @test2(<4 x float>* nocapture %c, float* nocapture readonl ; P8-NEXT: xxspltw v2, vs0, 1 ; P8-NEXT: stvx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: test2: +; P7: # %bb.0: # %entry +; P7-NEXT: lwz r4, 12(r4) +; P7-NEXT: addi r5, r1, -16 +; P7-NEXT: stw r4, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r5 +; P7-NEXT: xxspltw vs0, vs0, 0 +; P7-NEXT: stxvw4x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds float, float* %a, i64 3 %0 = load float, float* %arrayidx, align 4 @@ -67,6 +88,16 @@ define dso_local void @test3(<4 x i32>* nocapture %c, i32* nocapture readonly %a ; P8-NEXT: xxspltw v2, vs0, 1 ; P8-NEXT: stvx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: test3: +; P7: # %bb.0: # %entry +; P7-NEXT: lwz r4, 12(r4) +; P7-NEXT: addi r5, r1, -16 +; P7-NEXT: stw r4, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r5 +; P7-NEXT: xxspltw vs0, vs0, 0 +; P7-NEXT: stxvw4x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, i32* %a, i64 3 %0 = load i32, i32* %arrayidx, align 4 @@ -90,6 +121,16 @@ define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a ; P8-NEXT: lxvdsx vs0, 0, r4 ; P8-NEXT: stxvd2x vs0, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: test4: +; P7: # %bb.0: # %entry +; P7-NEXT: ld r4, 24(r4) +; P7-NEXT: addi r5, r1, -16 +; P7-NEXT: std r4, -8(r1) +; P7-NEXT: std r4, -16(r1) +; P7-NEXT: lxvd2x vs0, 0, r5 +; P7-NEXT: stxvd2x vs0, 0, r3 +; P7-NEXT: blr entry: %arrayidx = getelementptr inbounds i64, i64* %a, i64 3 %0 = load i64, i64* %arrayidx, align 8 @@ -110,6 +151,15 @@ define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) { ; P8-NEXT: lfiwzx f0, 0, r3 ; P8-NEXT: xxspltw v2, vs0, 1 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvwsx: +; P7: # %bb.0: # %entry +; P7-NEXT: lwz r3, 0(r3) +; P7-NEXT: addi r4, r1, -16 +; P7-NEXT: stw r3, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r4 +; P7-NEXT: xxspltw v2, vs0, 0 +; P7-NEXT: blr entry: %0 = bitcast i32* %s to <4 x i8>* %1 = load <4 x i8>, <4 x i8>* %0, align 4 @@ -129,6 +179,15 @@ define <16 x i8> @adjusted_lxvwsx(i64* %s, i64* %t) { ; P8-NEXT: lfdx f0, 0, r3 ; P8-NEXT: xxspltw v2, vs0, 0 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx: +; P7: # %bb.0: # %entry +; P7-NEXT: ld r3, 0(r3) +; P7-NEXT: addi r4, r1, -16 +; P7-NEXT: std r3, -16(r1) +; P7-NEXT: lxvw4x vs0, 0, r4 +; P7-NEXT: xxspltw v2, vs0, 1 +; P7-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* %1 = load <8 x i8>, <8 x i8>* %0, align 8 @@ -147,6 +206,12 @@ define <16 x i8> @unadjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 3 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvwsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 0 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -165,6 +230,12 @@ define <16 x i8> @adjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 2 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 1 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x 
i32> @@ -183,6 +254,12 @@ define <16 x i8> @adjusted_lxvwsx_v16i8_2(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 1 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx_v16i8_2: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 2 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -201,6 +278,12 @@ define <16 x i8> @adjusted_lxvwsx_v16i8_3(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: lvx v2, 0, r3 ; P8-NEXT: xxspltw v2, v2, 0 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvwsx_v16i8_3: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvw4x vs0, 0, r3 +; P7-NEXT: xxspltw v2, vs0, 3 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -217,6 +300,11 @@ define <16 x i8> @unadjusted_lxvdsx(i64* %s, i64* %t) { ; P8: # %bb.0: # %entry ; P8-NEXT: lxvdsx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvdsx: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvdsx v2, 0, r3 +; P7-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* %1 = load <8 x i8>, <8 x i8>* %0, align 8 @@ -234,6 +322,11 @@ define <16 x i8> @unadjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8: # %bb.0: # %entry ; P8-NEXT: lxvdsx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: unadjusted_lxvdsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: lxvdsx v2, 0, r3 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -252,6 +345,12 @@ define <16 x i8> @adjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P8-NEXT: addi r3, r3, 8 ; P8-NEXT: lxvdsx v2, 0, r3 ; P8-NEXT: blr +; +; P7-LABEL: adjusted_lxvdsx_v16i8: +; P7: # %bb.0: # %entry +; P7-NEXT: addi r3, r3, 8 +; P7-NEXT: lxvdsx v2, 0, r3 +; P7-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll index 6a251e076005a..27ee2fda1f1b0 100644 --- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -2561,3 +2561,36 @@ entry: ret double %vecext ; FIXME: add check patterns when variable element extraction is implemented } + +; Check that the combine is properly skipped when the LHS is an i32-to-vector build +; and the RHS is an i64-to-vector build.
+define <2 x i64> @buildi2(i64 %arg, i32 %arg1) { +; CHECK-LABEL: buildi2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r4, r4, 32 +; CHECK-NEXT: mtfprd f1, r3 +; CHECK-NEXT: mtfprd f0, r4 +; CHECK-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-NEXT: blr +; +; CHECK-LE-LABEL: buildi2: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: mtfprwz f0, r4 +; CHECK-LE-NEXT: mtfprd f1, r3 +; CHECK-LE-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-NEXT: blr +; +; CHECK-AIX-LABEL: buildi2: +; CHECK-AIX: # %bb.0: # %entry +; CHECK-AIX-NEXT: sldi 4, 4, 32 +; CHECK-AIX-NEXT: mtfprd 1, 3 +; CHECK-AIX-NEXT: mtfprd 0, 4 +; CHECK-AIX-NEXT: xxmrghd 34, 0, 1 +; CHECK-AIX-NEXT: blr +entry: + %lhs.i32 = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.i32 to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} diff --git a/llvm/test/CodeGen/WebAssembly/masked-shifts.ll b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll new file mode 100644 index 0000000000000..75db5e190bd22 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/masked-shifts.ll @@ -0,0 +1,531 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s + +;; Check that masked shift counts are optimized out. + +;; TODO: optimize the *_late functions. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +define i32 @shl_i32(i32 %v, i32 %x) { +; CHECK-LABEL: shl_i32: +; CHECK: .functype shl_i32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.shl +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %a = shl i32 %v, %m + ret i32 %a +} + +define i32 @sra_i32(i32 %v, i32 %x) { +; CHECK-LABEL: sra_i32: +; CHECK: .functype sra_i32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %a = ashr i32 %v, %m + ret i32 %a +} + +define i32 @srl_i32(i32 %v, i32 %x) { +; CHECK-LABEL: srl_i32: +; CHECK: .functype srl_i32 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %a = lshr i32 %v, %m + ret i32 %a +} + +define i64 @shl_i64(i64 %v, i64 %x) { +; CHECK-LABEL: shl_i64: +; CHECK: .functype shl_i64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.shl +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %a = shl i64 %v, %m + ret i64 %a +} + +define i64 @sra_i64(i64 %v, i64 %x) { +; CHECK-LABEL: sra_i64: +; CHECK: .functype sra_i64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %a = ashr i64 %v, %m + ret i64 %a +} + +define i64 @srl_i64(i64 %v, i64 %x) { +; CHECK-LABEL: srl_i64: +; CHECK: .functype srl_i64 (i64, i64) -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %a = lshr i64 %v, %m + ret i64 %a +} + +define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: shl_v16i8: +; CHECK: .functype shl_v16i8 (v128, i32) -> (v128) +; CHECK-NEXT: # 
%bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shl +; CHECK-NEXT: # fallthrough-return + %m = and i8 %x, 7 + %t = insertelement <16 x i8> undef, i8 %m, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %a = shl <16 x i8> %v, %s + ret <16 x i8> %a +} + +define <16 x i8> @shl_v16i8_late(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: shl_v16i8_late: +; CHECK: .functype shl_v16i8_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.splat +; CHECK-NEXT: v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: i8x16.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <16 x i8> undef, i8 %x, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %m = and <16 x i8> %s, + %a = shl <16 x i8> %v, %m + ret <16 x i8> %a +} + +define <16 x i8> @ashr_v16i8(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: ashr_v16i8: +; CHECK: .functype ashr_v16i8 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i8 %x, 7 + %t = insertelement <16 x i8> undef, i8 %m, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %a = ashr <16 x i8> %v, %s + ret <16 x i8> %a +} + +define <16 x i8> @ashr_v16i8_late(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: ashr_v16i8_late: +; CHECK: .functype ashr_v16i8_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.splat +; CHECK-NEXT: v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: i8x16.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <16 x i8> undef, i8 %x, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %m = and <16 x i8> %s, + %a = ashr <16 x i8> %v, %m + ret <16 x i8> %a +} + +define <16 x i8> @lshr_v16i8(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: lshr_v16i8: +; CHECK: .functype lshr_v16i8 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i8 %x, 7 + %t = insertelement <16 x i8> undef, i8 %m, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %a = lshr <16 x i8> %v, %s + ret <16 x i8> %a +} + +define <16 x i8> @lshr_v16i8_late(<16 x i8> %v, i8 %x) { +; CHECK-LABEL: lshr_v16i8_late: +; CHECK: .functype lshr_v16i8_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.splat +; CHECK-NEXT: v128.const 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i8x16.extract_lane_u 0 +; CHECK-NEXT: i8x16.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <16 x i8> undef, i8 %x, i32 0 + %s = shufflevector <16 x i8> %t, <16 x i8> undef, + <16 x i32> + %m = and <16 x i8> %s, + %a = lshr <16 x i8> %v, %m + ret <16 x i8> %a +} + +define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: shl_v8i16: +; CHECK: .functype shl_v8i16 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.shl +; CHECK-NEXT: # fallthrough-return + %m = and i16 %x, 15 + %t = insertelement <8 x i16> undef, i16 %m, i32 0 + %s = shufflevector <8 x i16> %t, <8 x 
i16> undef, + <8 x i32> + %a = shl <8 x i16> %v, %s + ret <8 x i16> %a +} + +define <8 x i16> @shl_v8i16_late(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: shl_v8i16_late: +; CHECK: .functype shl_v8i16_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.splat +; CHECK-NEXT: v128.const 15, 15, 15, 15, 15, 15, 15, 15 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.extract_lane_u 0 +; CHECK-NEXT: i16x8.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <8 x i16> undef, i16 %x, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %m = and <8 x i16> %s, + + %a = shl <8 x i16> %v, %m + ret <8 x i16> %a +} + +define <8 x i16> @ashr_v8i16(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: ashr_v8i16: +; CHECK: .functype ashr_v8i16 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i16 %x, 15 + %t = insertelement <8 x i16> undef, i16 %m, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %a = ashr <8 x i16> %v, %s + ret <8 x i16> %a +} + +define <8 x i16> @ashr_v8i16_late(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: ashr_v8i16_late: +; CHECK: .functype ashr_v8i16_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.splat +; CHECK-NEXT: v128.const 15, 15, 15, 15, 15, 15, 15, 15 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.extract_lane_u 0 +; CHECK-NEXT: i16x8.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <8 x i16> undef, i16 %x, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %m = and <8 x i16> %s, + + %a = ashr <8 x i16> %v, %m + ret <8 x i16> %a +} + +define <8 x i16> @lshr_v8i16(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: lshr_v8i16: +; CHECK: .functype lshr_v8i16 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i16 %x, 15 + %t = insertelement <8 x i16> undef, i16 %m, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %a = lshr <8 x i16> %v, %s + ret <8 x i16> %a +} + +define <8 x i16> @lshr_v8i16_late(<8 x i16> %v, i16 %x) { +; CHECK-LABEL: lshr_v8i16_late: +; CHECK: .functype lshr_v8i16_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i16x8.splat +; CHECK-NEXT: v128.const 15, 15, 15, 15, 15, 15, 15, 15 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i16x8.extract_lane_u 0 +; CHECK-NEXT: i16x8.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <8 x i16> undef, i16 %x, i32 0 + %s = shufflevector <8 x i16> %t, <8 x i16> undef, + <8 x i32> + %m = and <8 x i16> %s, + + %a = lshr <8 x i16> %v, %m + ret <8 x i16> %a +} + +define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: shl_v4i32: +; CHECK: .functype shl_v4i32 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %t = insertelement <4 x i32> undef, i32 %m, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %a = shl <4 x i32> %v, %s + ret <4 x i32> %a +} + +define <4 x i32> @shl_v4i32_late(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: shl_v4i32_late: +; CHECK: .functype shl_v4i32_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: 
local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: v128.const 31, 31, 31, 31 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i32x4.extract_lane 0 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <4 x i32> undef, i32 %x, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %m = and <4 x i32> %s, + %a = shl <4 x i32> %v, %m + ret <4 x i32> %a +} + +define <4 x i32> @ashr_v4i32(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: ashr_v4i32: +; CHECK: .functype ashr_v4i32 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %t = insertelement <4 x i32> undef, i32 %m, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %a = ashr <4 x i32> %v, %s + ret <4 x i32> %a +} + +define <4 x i32> @ashr_v4i32_late(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: ashr_v4i32_late: +; CHECK: .functype ashr_v4i32_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: v128.const 31, 31, 31, 31 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i32x4.extract_lane 0 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <4 x i32> undef, i32 %x, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %m = and <4 x i32> %s, + %a = ashr <4 x i32> %v, %m + ret <4 x i32> %a +} + +define <4 x i32> @lshr_v4i32(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: lshr_v4i32: +; CHECK: .functype lshr_v4i32 (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i32 %x, 31 + %t = insertelement <4 x i32> undef, i32 %m, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %a = lshr <4 x i32> %v, %s + ret <4 x i32> %a +} + +define <4 x i32> @lshr_v4i32_late(<4 x i32> %v, i32 %x) { +; CHECK-LABEL: lshr_v4i32_late: +; CHECK: .functype lshr_v4i32_late (v128, i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: v128.const 31, 31, 31, 31 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i32x4.extract_lane 0 +; CHECK-NEXT: i32x4.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <4 x i32> undef, i32 %x, i32 0 + %s = shufflevector <4 x i32> %t, <4 x i32> undef, + <4 x i32> + %m = and <4 x i32> %s, + %a = lshr <4 x i32> %v, %m + ret <4 x i32> %a +} + +define <2 x i64> @shl_v2i64(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: shl_v2i64: +; CHECK: .functype shl_v2i64 (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shl +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %t = insertelement <2 x i64> undef, i64 %m, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %a = shl <2 x i64> %v, %s + ret <2 x i64> %a +} + +define <2 x i64> @shl_v2i64_late(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: shl_v2i64_late: +; CHECK: .functype shl_v2i64_late (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64x2.splat +; CHECK-NEXT: v128.const 63, 63 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shl +; CHECK-NEXT: # fallthrough-return + %t = insertelement <2 x i64> undef, i64 %x, i32 0 
+ %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %m = and <2 x i64> %s, + %a = shl <2 x i64> %v, %m + ret <2 x i64> %a +} + +define <2 x i64> @ashr_v2i64(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: ashr_v2i64: +; CHECK: .functype ashr_v2i64 (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_s +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %t = insertelement <2 x i64> undef, i64 %m, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %a = ashr <2 x i64> %v, %s + ret <2 x i64> %a +} + +define <2 x i64> @ashr_v2i64_late(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: ashr_v2i64_late: +; CHECK: .functype ashr_v2i64_late (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64x2.splat +; CHECK-NEXT: v128.const 63, 63 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_s +; CHECK-NEXT: # fallthrough-return + %t = insertelement <2 x i64> undef, i64 %x, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %m = and <2 x i64> %s, + %a = ashr <2 x i64> %v, %m + ret <2 x i64> %a +} + +define <2 x i64> @lshr_v2i64(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: lshr_v2i64: +; CHECK: .functype lshr_v2i64 (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_u +; CHECK-NEXT: # fallthrough-return + %m = and i64 %x, 63 + %t = insertelement <2 x i64> undef, i64 %m, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %a = lshr <2 x i64> %v, %s + ret <2 x i64> %a +} + +define <2 x i64> @lshr_v2i64_late(<2 x i64> %v, i64 %x) { +; CHECK-LABEL: lshr_v2i64_late: +; CHECK: .functype lshr_v2i64_late (v128, i64) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64x2.splat +; CHECK-NEXT: v128.const 63, 63 +; CHECK-NEXT: v128.and +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: i64x2.shr_u +; CHECK-NEXT: # fallthrough-return + %t = insertelement <2 x i64> undef, i64 %x, i32 0 + %s = shufflevector <2 x i64> %t, <2 x i64> undef, <2 x i32> + %m = and <2 x i64> %s, + %a = lshr <2 x i64> %v, %m + ret <2 x i64> %a +} diff --git a/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll b/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll index 468382db00b46..8d04f5285ece6 100644 --- a/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll +++ b/llvm/test/Transforms/LoopDeletion/eval_first_iteration.ll @@ -1046,3 +1046,192 @@ done: ; preds = %backedge %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] ret i32 %sum.next.lcssa } + + + +; Switch tests + +; Here switch will always jump to the default label +define i32 @test_switch_ne_default() { +; CHECK-LABEL: @test_switch_ne_default( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 4, [[SUM]] +; CHECK-NEXT: switch i32 [[SUB]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[ONZERO:%.*]] +; CHECK-NEXT: i32 1, label [[ONONE:%.*]] +; CHECK-NEXT: i32 2, label [[ONTWO:%.*]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onzero: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onone: +; CHECK-NEXT: br label 
[[BACKEDGE]] +; CHECK: ontwo: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[MERGE_PHI:%.*]] = phi i32 [ [[SUB]], [[DEFAULT]] ], [ 0, [[ONZERO]] ], [ 1, [[ONONE]] ], [ 2, [[ONTWO]] ] +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[MERGE_PHI]] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[SUM_NEXT]], 4 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %sum = phi i32 [ 0, %entry ], [ %sum.next, %backedge ] + %sub = sub i32 4, %sum + switch i32 %sub, label %default [ + i32 0, label %onzero + i32 1, label %onone + i32 2, label %ontwo + ] + +default: ; preds = %loop + br label %backedge + +onzero: ; preds = %loop + br label %backedge + +onone: ; preds = %loop + br label %backedge + +ontwo: ; preds = %loop + br label %backedge + +backedge: ; preds = %ontwo, %onone, %onzero, %default + %merge.phi = phi i32 [ %sub, %default ], [ 0, %onzero ], [ 1, %onone ], [ 2, %ontwo ] + %sum.next = add i32 %sum, %merge.phi + %loop.cond = icmp ne i32 %sum.next, 4 + br i1 %loop.cond, label %loop, label %done + +done: ; preds = %backedge + %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] + ret i32 %sum.next.lcssa +} + +; Here switch will always jump to the %ontwo label +define i32 @test_switch_ne_one_case() { +; CHECK-LABEL: @test_switch_ne_one_case( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 4, [[SUM]] +; CHECK-NEXT: switch i32 [[SUB]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[ONZERO:%.*]] +; CHECK-NEXT: i32 1, label [[ONONE:%.*]] +; CHECK-NEXT: i32 4, label [[ONTWO:%.*]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onzero: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: onone: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: ontwo: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[MERGE_PHI:%.*]] = phi i32 [ 2, [[DEFAULT]] ], [ 0, [[ONZERO]] ], [ 1, [[ONONE]] ], [ [[SUB]], [[ONTWO]] ] +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[MERGE_PHI]] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[SUM_NEXT]], 4 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %sum = phi i32 [ 0, %entry ], [ %sum.next, %backedge ] + %sub = sub i32 4, %sum + switch i32 %sub, label %default [ + i32 0, label %onzero + i32 1, label %onone + i32 4, label %ontwo + ] + +default: ; preds = %loop + br label %backedge + +onzero: ; preds = %loop + br label %backedge + +onone: ; preds = %loop + br label %backedge + +ontwo: ; preds = %loop + br label %backedge + +backedge: ; preds = %ontwo, %onone, %onzero, %default + %merge.phi = phi i32 [ 2, %default ], [ 0, %onzero ], [ 1, %onone ], [ %sub, %ontwo ] + %sum.next = add i32 %sum, %merge.phi + %loop.cond = icmp ne i32 %sum.next, 4 + br i1 %loop.cond, label %loop, label %done + +done: ; preds = %backedge + %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] + ret i32 %sum.next.lcssa +} + +; Here switch will always jump to the %backedge label, but there are two jumps to this label in switch 
+define i32 @test_switch_ne_one_case_identical_jumps() { +; CHECK-LABEL: @test_switch_ne_one_case_identical_jumps( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 2, [[SUM]] +; CHECK-NEXT: switch i32 [[SUB]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[FIRST_BLOCK:%.*]] +; CHECK-NEXT: i32 1, label [[BACKEDGE]] +; CHECK-NEXT: i32 2, label [[BACKEDGE]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: first_block: +; CHECK-NEXT: br label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[MERGE_PHI:%.*]] = phi i32 [ 0, [[DEFAULT]] ], [ 1, [[FIRST_BLOCK]] ], [ [[SUB]], [[LOOP]] ], [ [[SUB]], [[LOOP]] ] +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[MERGE_PHI]] +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[SUM_NEXT]], 2 +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[SUM_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %backedge, %entry + %sum = phi i32 [ 0, %entry ], [ %sum.next, %backedge ] + %sub = sub i32 2, %sum + switch i32 %sub, label %default [ + i32 0, label %first_block + i32 1, label %backedge + i32 2, label %backedge + ] + +default: ; preds = %loop + br label %backedge + +first_block: ; preds = %loop + br label %backedge + +backedge: ; preds = %first_block, %default, %loop, %loop + %merge.phi = phi i32 [ 0, %default ], [ 1, %first_block ], [ %sub, %loop ], [ %sub, %loop ] + %sum.next = add i32 %sum, %merge.phi + %loop.cond = icmp ne i32 %sum.next, 2 + br i1 %loop.cond, label %loop, label %done + +done: ; preds = %backedge + %sum.next.lcssa = phi i32 [ %sum.next, %backedge ] + ret i32 %sum.next.lcssa +} diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s index 00b429ef6d67d..0ffdad05cfa67 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s @@ -41,12 +41,12 @@ v_sqrt_f64 v[4:5], v[4:5] # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 28 -# CHECK-NEXT: Total Cycles: 205 +# CHECK-NEXT: Total Cycles: 224 # CHECK-NEXT: Total uOps: 29 # CHECK: Dispatch Width: 1 -# CHECK-NEXT: uOps Per Cycle: 0.14 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 0.13 +# CHECK-NEXT: IPC: 0.13 # CHECK-NEXT: Block RThroughput: 29.0 # CHECK: Instruction Info: @@ -133,37 +133,37 @@ v_sqrt_f64 v[4:5], v[4:5] # CHECK-NEXT: - - - 1.00 - 1.00 1.00 - v_sqrt_f64_e32 v[4:5], v[4:5] # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 01234 - -# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 -# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] -# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
v_cvt_f64_f32_e32 v[6:7], v6 -# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] -# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 -# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] -# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5] -# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7] -# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3] -# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5] -# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7] -# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9] -# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] -# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0 -# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] -# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 -# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1] -# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3] -# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123456789 0123 + +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_i32_e32 v[2:3], v2 +# CHECK-NEXT: [0,2] . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f32_f64_e32 v4, v[4:5] +# CHECK-NEXT: [0,3] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_f32_e32 v[6:7], v6 +# CHECK-NEXT: [0,4] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_u32_f64_e32 v8, v[8:9] +# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_cvt_f64_u32_e32 v[10:11], v10 +# CHECK-NEXT: [0,6] . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_exp_i32_f64_e32 v0, v[0:1] +# CHECK-NEXT: [0,7] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_frexp_mant_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,8] . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fract_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,9] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_trunc_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: [0,10] . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_ceil_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,11] . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_rndne_f64_e32 v[4:5], v[4:5] +# CHECK-NEXT: [0,12] . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_floor_f64_e32 v[6:7], v[6:7] +# CHECK-NEXT: [0,13] . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . . v_fma_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,14] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . . . . . . . . . . . . . . v_add_f64 v[2:3], v[2:3], v[2:3] +# CHECK-NEXT: [0,15] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_mul_f64 v[4:5], v[4:5], v[4:5] +# CHECK-NEXT: [0,16] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_min_f64 v[6:7], v[6:7], v[6:7] +# CHECK-NEXT: [0,17] . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . . . . . v_max_f64 v[8:9], v[8:9], v[8:9] +# CHECK-NEXT: [0,18] . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . . . . . 
v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,19] . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . . . . . . . . . . . . . . v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,20] . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . . . . . v_ldexp_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: [0,21] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE. . . . . . . . . . . v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1] +# CHECK-NEXT: [0,22] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_trig_preop_f64 v[2:3], v[2:3], v0 +# CHECK-NEXT: [0,23] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeE . . . . . . v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1] +# CHECK-NEXT: [0,24] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .DeeeeeeeeeeeeeeeeeeeeeE . . v_cmp_class_f64_e64 vcc_lo, v[2:3], s0 +# CHECK-NEXT: [0,25] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE . v_rcp_f64_e32 v[0:1], v[0:1] +# CHECK-NEXT: [0,26] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE. v_rsq_f64_e32 v[2:3], v[2:3] +# CHECK-NEXT: [0,27] . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . DeeeeeeeeeeeeeeeeeeeeeeeE v_sqrt_f64_e32 v[4:5], v[4:5] # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s deleted file mode 100644 index 706ed36f9e980..0000000000000 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx9-retireooo.s +++ /dev/null @@ -1,233 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx900 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s - -s_load_dwordx2 s[2:3], s[0:1], 0x24 -s_load_dwordx2 s[0:1], s[0:1], 0x2c -s_waitcnt lgkmcnt(0) -v_mov_b32_e32 v0, s2 -v_mov_b32_e32 v1, s3 -flat_load_dword v2, v[0:1] -flat_load_dword v3, v[0:1] offset:8 -flat_load_dword v4, v[0:1] offset:16 -flat_load_dword v5, v[0:1] offset:24 -v_mov_b32_e32 v0, s0 -v_mov_b32_e32 v1, s1 -v_mov_b32_e32 v6, s6 -v_mov_b32_e32 v7, s7 -v_mov_b32_e32 v8, s8 -v_mov_b32_e32 v9, s9 -v_mov_b32_e32 v10, s10 -v_mov_b32_e32 v11, s11 -v_mov_b32_e32 v12, s12 -v_mov_b32_e32 v13, s13 -v_mov_b32_e32 v14, s14 -v_mov_b32_e32 v15, s15 -v_mov_b32_e32 v16, s16 -v_mov_b32_e32 v17, s17 -v_mov_b32_e32 v18, s18 -v_mov_b32_e32 v19, s19 -v_mov_b32_e32 v20, s20 -v_mov_b32_e32 v21, s21 -v_mov_b32_e32 v22, s22 -v_mov_b32_e32 v23, s23 -v_mov_b32_e32 v24, s24 -v_mov_b32_e32 v25, s25 -v_mov_b32_e32 v26, s26 -v_mov_b32_e32 v27, s27 -v_mov_b32_e32 v28, s28 -v_mov_b32_e32 v29, s29 -s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Iterations: 1 -# CHECK-NEXT: Instructions: 36 -# CHECK-NEXT: Total Cycles: 94 -# CHECK-NEXT: Total uOps: 36 - -# CHECK: Dispatch Width: 1 -# CHECK-NEXT: uOps Per Cycle: 0.38 -# CHECK-NEXT: IPC: 0.38 -# CHECK-NEXT: Block RThroughput: 36.0 - -# CHECK: Instruction Info: -# CHECK-NEXT: [1]: #uOps -# CHECK-NEXT: [2]: Latency -# CHECK-NEXT: [3]: RThroughput -# CHECK-NEXT: [4]: MayLoad -# CHECK-NEXT: [5]: MayStore -# CHECK-NEXT: [6]: HasSideEffects (U) - -# CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 5 1.00 * 
s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: 1 5 1.00 * s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: 1 1 1.00 U s_waitcnt lgkmcnt(0) -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s2 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s3 -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v2, v[0:1] -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: 1 80 1.00 * U flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v0, s0 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v1, s1 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v6, s6 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v7, s7 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v8, s8 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v9, s9 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v10, s10 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v11, s11 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v12, s12 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v13, s13 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v14, s14 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v15, s15 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v16, s16 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v17, s17 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v18, s18 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v19, s19 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v20, s20 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v21, s21 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v22, s22 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v23, s23 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v24, s24 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v25, s25 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v26, s26 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v27, s27 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v28, s28 -# CHECK-NEXT: 1 1 1.00 U v_mov_b32_e32 v29, s29 -# CHECK-NEXT: 1 1 1.00 U s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Resources: -# CHECK-NEXT: [0] - HWBranch -# CHECK-NEXT: [1] - HWExport -# CHECK-NEXT: [2] - HWLGKM -# CHECK-NEXT: [3] - HWSALU -# CHECK-NEXT: [4] - HWVALU -# CHECK-NEXT: [5] - HWVMEM -# CHECK-NEXT: [6] - HWXDL - -# CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] -# CHECK-NEXT: - - 2.00 2.00 28.00 4.00 - - -# CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: - - 1.00 - - - - s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: - - - 1.00 - - - s_waitcnt lgkmcnt(0) -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s2 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s3 -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v2, v[0:1] -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: - - - - - 1.00 - flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v0, s0 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v1, s1 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v6, s6 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v7, s7 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v8, s8 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v9, s9 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v10, s10 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v11, s11 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v12, s12 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v13, s13 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v14, s14 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v15, s15 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v16, s16 -# CHECK-NEXT: - - - - 
1.00 - - v_mov_b32_e32 v17, s17 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v18, s18 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v19, s19 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v20, s20 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v21, s21 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v22, s22 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v23, s23 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v24, s24 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v25, s25 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v26, s26 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v27, s27 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v28, s28 -# CHECK-NEXT: - - - - 1.00 - - v_mov_b32_e32 v29, s29 -# CHECK-NEXT: - - - 1.00 - - - s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 0123 -# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 0123456789 - -# CHECK: [0,0] DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: [0,1] .DeeeeE . . . . . . . . . . . . . . . . . . s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: [0,2] . .DE . . . . . . . . . . . . . . . . . . s_waitcnt lgkmcnt(0) -# CHECK-NEXT: [0,3] . . DE . . . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s2 -# CHECK-NEXT: [0,4] . . DE. . . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s3 -# CHECK-NEXT: [0,5] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. . flat_load_dword v2, v[0:1] -# CHECK-NEXT: [0,6] . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: [0,7] . . .DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE . flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: [0,8] . . . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeE. flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: [0,9] . . . DE. . . . . . . . . . . . . . . . . v_mov_b32_e32 v0, s0 -# CHECK-NEXT: [0,10] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v1, s1 -# CHECK-NEXT: [0,11] . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v6, s6 -# CHECK-NEXT: [0,12] . . . .DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v7, s7 -# CHECK-NEXT: [0,13] . . . . DE . . . . . . . . . . . . . . . . v_mov_b32_e32 v8, s8 -# CHECK-NEXT: [0,14] . . . . DE. . . . . . . . . . . . . . . . v_mov_b32_e32 v9, s9 -# CHECK-NEXT: [0,15] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v10, s10 -# CHECK-NEXT: [0,16] . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v11, s11 -# CHECK-NEXT: [0,17] . . . . .DE . . . . . . . . . . . . . . . v_mov_b32_e32 v12, s12 -# CHECK-NEXT: [0,18] . . . . . DE . . . . . . . . . . . . . . . v_mov_b32_e32 v13, s13 -# CHECK-NEXT: [0,19] . . . . . DE. . . . . . . . . . . . . . . v_mov_b32_e32 v14, s14 -# CHECK-NEXT: [0,20] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v15, s15 -# CHECK-NEXT: [0,21] . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v16, s16 -# CHECK-NEXT: [0,22] . . . . . .DE . . . . . . . . . . . . . . v_mov_b32_e32 v17, s17 -# CHECK-NEXT: [0,23] . . . . . . DE . . . . . . . . . . . . . . v_mov_b32_e32 v18, s18 -# CHECK-NEXT: [0,24] . . . . . . DE. . . . . . . . . . . . . . v_mov_b32_e32 v19, s19 -# CHECK-NEXT: [0,25] . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v20, s20 -# CHECK-NEXT: [0,26] . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v21, s21 -# CHECK-NEXT: [0,27] . . . . . . .DE . . . . . . . . . . 
. . . v_mov_b32_e32 v22, s22 -# CHECK-NEXT: [0,28] . . . . . . . DE . . . . . . . . . . . . . v_mov_b32_e32 v23, s23 -# CHECK-NEXT: [0,29] . . . . . . . DE. . . . . . . . . . . . . v_mov_b32_e32 v24, s24 -# CHECK-NEXT: [0,30] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v25, s25 -# CHECK-NEXT: [0,31] . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v26, s26 -# CHECK-NEXT: [0,32] . . . . . . . .DE . . . . . . . . . . . . v_mov_b32_e32 v27, s27 -# CHECK-NEXT: [0,33] . . . . . . . . DE . . . . . . . . . . . . v_mov_b32_e32 v28, s28 -# CHECK-NEXT: [0,34] . . . . . . . . DE. . . . . . . . . . . . v_mov_b32_e32 v29, s29 -# CHECK-NEXT: [0,35] . . . . . . . . . . . . . . . . . . . DE s_waitcnt vmcnt(0) lgkmcnt(0) - -# CHECK: Average Wait times (based on the timeline view): -# CHECK-NEXT: [0]: Executions -# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue -# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready -# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage - -# CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 1 0.0 0.0 0.0 s_load_dwordx2 s[2:3], s[0:1], 0x24 -# CHECK-NEXT: 1. 1 0.0 0.0 0.0 s_load_dwordx2 s[0:1], s[0:1], 0x2c -# CHECK-NEXT: 2. 1 0.0 0.0 0.0 s_waitcnt lgkmcnt(0) -# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s2 -# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s3 -# CHECK-NEXT: 5. 1 0.0 0.0 0.0 flat_load_dword v2, v[0:1] -# CHECK-NEXT: 6. 1 0.0 0.0 0.0 flat_load_dword v3, v[0:1] offset:8 -# CHECK-NEXT: 7. 1 0.0 0.0 0.0 flat_load_dword v4, v[0:1] offset:16 -# CHECK-NEXT: 8. 1 0.0 0.0 0.0 flat_load_dword v5, v[0:1] offset:24 -# CHECK-NEXT: 9. 1 0.0 0.0 0.0 v_mov_b32_e32 v0, s0 -# CHECK-NEXT: 10. 1 0.0 0.0 0.0 v_mov_b32_e32 v1, s1 -# CHECK-NEXT: 11. 1 0.0 0.0 0.0 v_mov_b32_e32 v6, s6 -# CHECK-NEXT: 12. 1 0.0 0.0 0.0 v_mov_b32_e32 v7, s7 -# CHECK-NEXT: 13. 1 0.0 0.0 0.0 v_mov_b32_e32 v8, s8 -# CHECK-NEXT: 14. 1 0.0 0.0 0.0 v_mov_b32_e32 v9, s9 -# CHECK-NEXT: 15. 1 0.0 0.0 0.0 v_mov_b32_e32 v10, s10 -# CHECK-NEXT: 16. 1 0.0 0.0 0.0 v_mov_b32_e32 v11, s11 -# CHECK-NEXT: 17. 1 0.0 0.0 0.0 v_mov_b32_e32 v12, s12 -# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_mov_b32_e32 v13, s13 -# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_mov_b32_e32 v14, s14 -# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_mov_b32_e32 v15, s15 -# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_mov_b32_e32 v16, s16 -# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_mov_b32_e32 v17, s17 -# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_mov_b32_e32 v18, s18 -# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_mov_b32_e32 v19, s19 -# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_mov_b32_e32 v20, s20 -# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_mov_b32_e32 v21, s21 -# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_mov_b32_e32 v22, s22 -# CHECK-NEXT: 28. 1 0.0 0.0 0.0 v_mov_b32_e32 v23, s23 -# CHECK-NEXT: 29. 1 0.0 0.0 0.0 v_mov_b32_e32 v24, s24 -# CHECK-NEXT: 30. 1 0.0 0.0 0.0 v_mov_b32_e32 v25, s25 -# CHECK-NEXT: 31. 1 0.0 0.0 0.0 v_mov_b32_e32 v26, s26 -# CHECK-NEXT: 32. 1 0.0 0.0 0.0 v_mov_b32_e32 v27, s27 -# CHECK-NEXT: 33. 1 0.0 0.0 0.0 v_mov_b32_e32 v28, s28 -# CHECK-NEXT: 34. 1 0.0 0.0 0.0 v_mov_b32_e32 v29, s29 -# CHECK-NEXT: 35. 
1 0.0 0.0 0.0 s_waitcnt vmcnt(0) lgkmcnt(0) -# CHECK-NEXT: 1 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-nm/just-symbols.test b/llvm/test/tools/llvm-nm/just-symbols.test index 24284610ea89f..e0926c8a93209 100644 --- a/llvm/test/tools/llvm-nm/just-symbols.test +++ b/llvm/test/tools/llvm-nm/just-symbols.test @@ -7,7 +7,7 @@ # RUN: llvm-nm --just-symbol-name %t.o | diff %t.txt - # RUN: llvm-nm --format=just-symbols %t.o | diff %t.txt - # RUN: llvm-nm --format=sysv -j %t.o | diff %t.txt - -# RUN: llvm-nm -j --format=posix %t.o | not diff -q %t.txt %t1.txt +# RUN: llvm-nm -j --format=posix %t.o | not diff -q %t.txt - # RUN: FileCheck %s --input-file=%t.txt --implicit-check-not={{.}} --check-prefix=COMMON diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp index 6ee77fa2b3845..a655f3faf1bf2 100644 --- a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp +++ b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.cpp @@ -19,311 +19,15 @@ namespace llvm { namespace mca { -void AMDGPUInstrPostProcess::postProcessInstruction( - std::unique_ptr &Inst, const MCInst &MCI) { - switch (MCI.getOpcode()) { - case AMDGPU::S_WAITCNT: - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_VSCNT: - case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - case AMDGPU::S_WAITCNT_VSCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx6_gfx7: - case AMDGPU::S_WAITCNT_vi: - return processWaitCnt(Inst, MCI); - } -} - -// s_waitcnt instructions encode important information as immediate operands -// which are lost during the MCInst -> mca::Instruction lowering. -void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr &Inst, - const MCInst &MCI) { - for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) { - MCAOperand Op; - const MCOperand &MCOp = MCI.getOperand(Idx); - if (MCOp.isReg()) { - Op = MCAOperand::createReg(MCOp.getReg()); - } else if (MCOp.isImm()) { - Op = MCAOperand::createImm(MCOp.getImm()); - } - Op.setIndex(Idx); - Inst->addOperand(Op); - } -} - AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr, const MCInstrInfo &MCII) - : CustomBehaviour(STI, SrcMgr, MCII) { - generateWaitCntInfo(); -} + : CustomBehaviour(STI, SrcMgr, MCII) {} unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef IssuedInst, const InstRef &IR) { - const Instruction &Inst = *IR.getInstruction(); - unsigned Opcode = Inst.getOpcode(); - - // llvm-mca is generally run on fully compiled assembly so we wouldn't see any - // pseudo instructions here. However, there are plans for the future to make - // it possible to use mca within backend passes. As such, I have left the - // pseudo version of s_waitcnt within this switch statement. - switch (Opcode) { - default: - return 0; - case AMDGPU::S_WAITCNT: // This instruction - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo. 
- case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - case AMDGPU::S_WAITCNT_VSCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx6_gfx7: - case AMDGPU::S_WAITCNT_vi: - // s_endpgm also behaves as if there is an implicit - // s_waitcnt 0, but I'm not sure if it would be appropriate - // to model this in llvm-mca based on how the iterations work - // while simulating the pipeline over and over. - return handleWaitCnt(IssuedInst, IR); - } - return 0; } -unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef IssuedInst, - const InstRef &IR) { - // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr. - // I do not know how that instruction works so I did not attempt to model it. - // set the max values to begin - unsigned Vmcnt = 63; - unsigned Expcnt = 7; - unsigned Lgkmcnt = 31; - unsigned Vscnt = 63; - unsigned CurrVmcnt = 0; - unsigned CurrExpcnt = 0; - unsigned CurrLgkmcnt = 0; - unsigned CurrVscnt = 0; - unsigned CyclesToWaitVm = ~0U; - unsigned CyclesToWaitExp = ~0U; - unsigned CyclesToWaitLgkm = ~0U; - unsigned CyclesToWaitVs = ~0U; - - computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt); - - // We will now look at each of the currently executing instructions - // to find out if this wait instruction still needs to wait. - for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { - const InstRef &PrevIR = *I; - const Instruction &PrevInst = *PrevIR.getInstruction(); - const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); - const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; - const int CyclesLeft = PrevInst.getCyclesLeft(); - assert(CyclesLeft != UNKNOWN_CYCLES && - "We should know how many cycles are left for this instruction"); - if (PrevInstWaitInfo.VmCnt) { - CurrVmcnt++; - if ((unsigned)CyclesLeft < CyclesToWaitVm) - CyclesToWaitVm = CyclesLeft; - } - if (PrevInstWaitInfo.ExpCnt) { - CurrExpcnt++; - if ((unsigned)CyclesLeft < CyclesToWaitExp) - CyclesToWaitExp = CyclesLeft; - } - if (PrevInstWaitInfo.LgkmCnt) { - CurrLgkmcnt++; - if ((unsigned)CyclesLeft < CyclesToWaitLgkm) - CyclesToWaitLgkm = CyclesLeft; - } - if (PrevInstWaitInfo.VsCnt) { - CurrVscnt++; - if ((unsigned)CyclesLeft < CyclesToWaitVs) - CyclesToWaitVs = CyclesLeft; - } - } - - unsigned CyclesToWait = ~0U; - if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait) - CyclesToWait = CyclesToWaitVm; - if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait) - CyclesToWait = CyclesToWaitExp; - if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait) - CyclesToWait = CyclesToWaitLgkm; - if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait) - CyclesToWait = CyclesToWaitVs; - - // We may underestimate how many cycles we need to wait, but this - // isn't a big deal. Our return value is just how many cycles until - // this function gets run again. So as long as we don't overestimate - // the wait time, we'll still end up stalling at this instruction - // for the correct number of cycles. 
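// Illustration (editorial, not part of this patch): the contract the comment
// above relies on is that mca::CustomBehaviour::checkCustomHazard() returns 0
// when the instruction may be dispatched, or the number of cycles to stall
// before the framework re-queries the hook, roughly:
//   unsigned StallCycles = CB.checkCustomHazard(IssuedInst, IR); // 0 == no hazard
// Because the hook is simply invoked again after the returned delay, an
// under-estimated stall costs extra calls but never a premature dispatch.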
- - if (CyclesToWait == ~0U) - return 0; - return CyclesToWait; -} - -void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, - unsigned &Expcnt, unsigned &Lgkmcnt, - unsigned &Vscnt) { - AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); - const Instruction &Inst = *IR.getInstruction(); - unsigned Opcode = Inst.getOpcode(); - - switch (Opcode) { - case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - case AMDGPU::S_WAITCNT_VSCNT_gfx10: { - // Should probably be checking for nullptr - // here, but I'm not sure how I should handle the case - // where we see a nullptr. - const MCAOperand *OpReg = Inst.getOperand(0); - const MCAOperand *OpImm = Inst.getOperand(1); - assert(OpReg && OpReg->isReg() && "First operand should be a register."); - assert(OpImm && OpImm->isImm() && "Second operand should be an immediate."); - if (OpReg->getReg() != AMDGPU::SGPR_NULL) { - // Instruction is using a real register. - // Since we can't know what value this register will have, - // we can't compute what the value of this wait should be. - WithColor::warning() << "The register component of " - << MCII.getName(Opcode) << " will be completely " - << "ignored. So the wait may not be accurate.\n"; - } - switch (Opcode) { - // Redundant switch so I don't have to repeat the code above - // for each case. There are more clever ways to avoid this - // extra switch and anyone can feel free to implement one of them. - case AMDGPU::S_WAITCNT_EXPCNT_gfx10: - Expcnt = OpImm->getImm(); - break; - case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: - Lgkmcnt = OpImm->getImm(); - break; - case AMDGPU::S_WAITCNT_VMCNT_gfx10: - Vmcnt = OpImm->getImm(); - break; - case AMDGPU::S_WAITCNT_VSCNT_gfx10: - Vscnt = OpImm->getImm(); - break; - } - return; - } - case AMDGPU::S_WAITCNT_gfx10: - case AMDGPU::S_WAITCNT_gfx6_gfx7: - case AMDGPU::S_WAITCNT_vi: - unsigned WaitCnt = Inst.getOperand(0)->getImm(); - AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt); - return; - } -} - -void AMDGPUCustomBehaviour::generateWaitCntInfo() { - // The core logic from this function is taken from - // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions - // that are being looked at are in the MachineInstr format, whereas we have - // access to the MCInst format. The side effects of this are that we can't use - // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst) - // functions. Therefore, we conservatively assume that these functions will - // return true. This may cause a few instructions to be incorrectly tagged - // with an extra CNT. However, these are instructions that do interact with at - // least one CNT so giving them an extra CNT shouldn't cause issues in most - // scenarios. 
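// Example (editorial, not part of this patch): under the conservative scheme
// described above, a flat_load_dword is tagged with both LgkmCnt and VmCnt,
// because at the MCInst level we cannot tell whether it targets LDS or VMEM;
// an s_waitcnt may therefore wait on it slightly longer than a
// MachineInstr-level analysis would require.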
- AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU()); - InstrWaitCntInfo.resize(SrcMgr.size()); - - int Index = 0; - for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) { - const std::unique_ptr &Inst = *I; - unsigned Opcode = Inst->getOpcode(); - const MCInstrDesc &MCID = MCII.get(Opcode); - if ((MCID.TSFlags & SIInstrFlags::DS) && - (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) { - InstrWaitCntInfo[Index].LgkmCnt = true; - if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds)) - InstrWaitCntInfo[Index].ExpCnt = true; - } else if (MCID.TSFlags & SIInstrFlags::FLAT) { - // We conservatively assume that mayAccessVMEMThroughFlat(Inst) - // and mayAccessLDSThroughFlat(Inst) would both return true for this - // instruction. We have to do this because those functions use - // information about the memory operands that we don't have access to. - InstrWaitCntInfo[Index].LgkmCnt = true; - if (!STI.hasFeature(AMDGPU::FeatureVscnt)) - InstrWaitCntInfo[Index].VmCnt = true; - else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) - InstrWaitCntInfo[Index].VmCnt = true; - else - InstrWaitCntInfo[Index].VsCnt = true; - } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) { - if (!STI.hasFeature(AMDGPU::FeatureVscnt)) - InstrWaitCntInfo[Index].VmCnt = true; - else if ((MCID.mayLoad() && - !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) || - ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() && - !MCID.mayStore())) - InstrWaitCntInfo[Index].VmCnt = true; - else if (MCID.mayStore()) - InstrWaitCntInfo[Index].VsCnt = true; - - // (IV.Major < 7) is meant to represent - // GCNTarget.vmemWriteNeedsExpWaitcnt() - // which is defined as - // { return getGeneration() < SEA_ISLANDS; } - if (IV.Major < 7 && - (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet))) - InstrWaitCntInfo[Index].ExpCnt = true; - } else if (MCID.TSFlags & SIInstrFlags::SMRD) { - InstrWaitCntInfo[Index].LgkmCnt = true; - } else if (MCID.TSFlags & SIInstrFlags::EXP) { - InstrWaitCntInfo[Index].ExpCnt = true; - } else { - switch (Opcode) { - case AMDGPU::S_SENDMSG: - case AMDGPU::S_SENDMSGHALT: - case AMDGPU::S_MEMTIME: - case AMDGPU::S_MEMREALTIME: - InstrWaitCntInfo[Index].LgkmCnt = true; - break; - } - } - } -} - -// taken from SIInstrInfo::isVMEM() -bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) { - return MCID.TSFlags & SIInstrFlags::MUBUF || - MCID.TSFlags & SIInstrFlags::MTBUF || - MCID.TSFlags & SIInstrFlags::MIMG; -} - -// taken from SIInstrInfo::hasModifiersSet() -bool AMDGPUCustomBehaviour::hasModifiersSet( - const std::unique_ptr &Inst, unsigned OpName) const { - int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName); - if (Idx == -1) - return false; - - const MCAOperand *Op = Inst->getOperand(Idx); - if (Op == nullptr || !Op->isImm() || !Op->getImm()) - return false; - - return true; -} - -// taken from SIInstrInfo::isAlwaysGDS() -bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const { - return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT || - Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR || - Opcode == AMDGPU::DS_GWS_SEMA_P || - Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || - Opcode == AMDGPU::DS_GWS_BARRIER; -} - } // namespace mca } // namespace llvm diff --git a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h index e1efafa427fd5..0dd21c7b4c446 100644 --- 
a/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h +++ b/llvm/tools/llvm-mca/lib/AMDGPU/AMDGPUCustomBehaviour.h @@ -23,8 +23,6 @@ namespace llvm { namespace mca { class AMDGPUInstrPostProcess : public InstrPostProcess { - void processWaitCnt(std::unique_ptr &Inst, const MCInst &MCI); - public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} @@ -32,54 +30,10 @@ class AMDGPUInstrPostProcess : public InstrPostProcess { ~AMDGPUInstrPostProcess() {} void postProcessInstruction(std::unique_ptr &Inst, - const MCInst &MCI) override; -}; - -struct WaitCntInfo { - bool VmCnt = false; - bool ExpCnt = false; - bool LgkmCnt = false; - bool VsCnt = false; + const MCInst &MCI) override {} }; class AMDGPUCustomBehaviour : public CustomBehaviour { - /// Whenever MCA would like to dispatch an s_waitcnt instructions, - /// we must check all the instruction that are still executing to see if - /// they modify the same CNT as we need to wait for. This vector - /// gets built in the constructor and contains 1 WaitCntInfo struct - /// for each instruction within the SrcManager. Each element - /// tells us which CNTs that instruction may interact with. - /// We conservatively assume some instructions interact with more - /// CNTs than they do in reality, so we will occasionally wait - /// longer than necessary, but we shouldn't ever wait for shorter. - std::vector InstrWaitCntInfo; - - /// This method gets called from the constructor and is - /// where we setup the InstrWaitCntInfo vector. - /// The core logic for determining which CNTs an instruction - /// interacts with is taken from SIInsertWaitcnts::updateEventWaitcntAfter(). - /// Unfortunately, some of the logic from that function is not avalable to us - /// in this scope so we conservatively end up assuming that some - /// instructions interact with more CNTs than they do in reality. - void generateWaitCntInfo(); - /// Helper function used in generateWaitCntInfo() - bool hasModifiersSet(const std::unique_ptr &Inst, - unsigned OpName) const; - /// Helper function used in generateWaitCntInfo() - bool isAlwaysGDS(uint16_t Opcode) const; - /// Helper function used in generateWaitCntInfo() - bool isVMEM(const MCInstrDesc &MCID); - /// This method gets called from checkCustomHazard when mca is attempting to - /// dispatch an s_waitcnt instruction (or one of its variants). The method - /// looks at each of the instructions that are still executing in the pipeline - /// to determine if the waitcnt should force a wait. - unsigned handleWaitCnt(ArrayRef IssuedInst, const InstRef &IR); - /// Based on the type of s_waitcnt instruction we are looking at, and what its - /// operands are, this method will set the values for each of the cnt - /// references provided as arguments. 
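// Editorial note (not part of this patch): for the combined s_waitcnt forms
// (S_WAITCNT_gfx10, _gfx6_gfx7, _vi) the removed implementation defers to
// AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt), which splits the
// single immediate into the per-counter fields for the given ISA version;
// only the single-counter *_gfx10 forms are decoded by hand.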
- void computeWaitCnt(const InstRef &IR, unsigned &Vmcnt, unsigned &Expcnt, - unsigned &Lgkmcnt, unsigned &Vscnt); - public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const SourceMgr &SrcMgr, const MCInstrInfo &MCII); diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index f260f0f9bf864..d29df4cd3425b 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -7,8 +7,12 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/Attributes.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/AsmParser/Parser.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" using namespace llvm; @@ -252,4 +256,44 @@ TEST(Attributes, AttributeListPrinting) { } } +TEST(Attributes, MismatchedABIAttrs) { + const char *IRString = R"IR( + declare void @f1(i32* byval(i32)) + define void @g() { + call void @f1(i32* null) + ret void + } + declare void @f2(i32* preallocated(i32)) + define void @h() { + call void @f2(i32* null) + ret void + } + declare void @f3(i32* inalloca(i32)) + define void @i() { + call void @f3(i32* null) + ret void + } + )IR"; + + SMDiagnostic Err; + LLVMContext Context; + std::unique_ptr M = parseAssemblyString(IRString, Err, Context); + ASSERT_TRUE(M); + + { + auto *I = cast(&M->getFunction("g")->getEntryBlock().front()); + ASSERT_TRUE(I->isByValArgument(0)); + ASSERT_TRUE(I->getParamByValType(0)); + } + { + auto *I = cast(&M->getFunction("h")->getEntryBlock().front()); + ASSERT_TRUE(I->getParamPreallocatedType(0)); + } + { + auto *I = cast(&M->getFunction("i")->getEntryBlock().front()); + ASSERT_TRUE(I->isInAllocaArgument(0)); + ASSERT_TRUE(I->getParamInAllocaType(0)); + } +} + } // end anonymous namespace diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn index 1718523875250..7deb8a3d3ed70 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/hwasan/BUILD.gn @@ -52,6 +52,7 @@ source_set("sources") { "hwasan_dynamic_shadow.cpp", "hwasan_dynamic_shadow.h", "hwasan_exceptions.cpp", + "hwasan_fuchsia.cpp", "hwasan_flags.h", "hwasan_globals.cpp", "hwasan_globals.h", diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn index dcea89146765b..458598b682f86 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/BUILD.gn @@ -1,3 +1,5 @@ +import("//llvm/lib/Target/targets.gni") + executable("llvm-mca") { deps = [ "//llvm/lib/MC", @@ -30,4 +32,9 @@ executable("llvm-mca") { "Views/View.cpp", "llvm-mca.cpp", ] + defines = [] + if (llvm_build_AMDGPU) { + deps += [ "//llvm/tools/llvm-mca/lib/AMDGPU" ] + defines += [ "HAS_AMDGPU" ] + } } diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn new file mode 100644 index 0000000000000..3bde981c58add --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-mca/lib/AMDGPU/BUILD.gn @@ -0,0 +1,15 @@ +static_library("AMDGPU") { + output_name = "LLVMMCACustomBehaviourAMDGPU" + deps = [ + "//llvm/lib/IR", + "//llvm/lib/Support", + "//llvm/lib/Target/AMDGPU", + + # llvm-mca/libAMDGPU reaches inside the Target/AMDGPU tablegen internals + # and must 
depend on these Target/AMDGPU-internal build targets. + "//llvm/lib/Target/AMDGPU/MCTargetDesc", + "//llvm/lib/Target/AMDGPU/Utils", + ] + include_dirs = [ "//llvm/lib/Target/AMDGPU" ] + sources = [ "AMDGPUCustomBehaviour.cpp" ] +} diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h new file mode 100644 index 0000000000000..323c9cfeb97f5 --- /dev/null +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -0,0 +1,193 @@ +//===- Pattern.h - Pattern for conversion to the LLVM dialect ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_LLVMCOMMON_PATTERN_H +#define MLIR_CONVERSION_LLVMCOMMON_PATTERN_H + +#include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { + +namespace LLVM { +namespace detail { +/// Replaces the given operation "op" with a new operation of type "targetOp" +/// and given operands. +LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp, + ValueRange operands, + LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter); +} // namespace detail +} // namespace LLVM + +/// Base class for operation conversions targeting the LLVM IR dialect. It +/// provides the conversion patterns with access to the LLVMTypeConverter and +/// the LowerToLLVMOptions. The class captures the LLVMTypeConverter and the +/// LowerToLLVMOptions by reference meaning the references have to remain alive +/// during the entire pattern lifetime. +class ConvertToLLVMPattern : public ConversionPattern { +public: + ConvertToLLVMPattern(StringRef rootOpName, MLIRContext *context, + LLVMTypeConverter &typeConverter, + PatternBenefit benefit = 1); + +protected: + /// Returns the LLVM dialect. + LLVM::LLVMDialect &getDialect() const; + + LLVMTypeConverter *getTypeConverter() const; + + /// Gets the MLIR type wrapping the LLVM integer type whose bit width is + /// defined by the used type converter. + Type getIndexType() const; + + /// Gets the MLIR type wrapping the LLVM integer type whose bit width + /// corresponds to that of a LLVM pointer type. + Type getIntPtrType(unsigned addressSpace = 0) const; + + /// Gets the MLIR type wrapping the LLVM void type. + Type getVoidType() const; + + /// Get the MLIR type wrapping the LLVM i8* type. + Type getVoidPtrType() const; + + /// Create a constant Op producing a value of `resultType` from an index-typed + /// integer attribute. + static Value createIndexAttrConstant(OpBuilder &builder, Location loc, + Type resultType, int64_t value); + + /// Create an LLVM dialect operation defining the given index constant. + Value createIndexConstant(ConversionPatternRewriter &builder, Location loc, + uint64_t value) const; + + // This is a strided getElementPtr variant that linearizes subscripts as: + // `base_offset + index_0 * stride_0 + ... + index_n * stride_n`. + Value getStridedElementPtr(Location loc, MemRefType type, Value memRefDesc, + ValueRange indices, + ConversionPatternRewriter &rewriter) const; + + /// Returns if the given memref has identity maps and the element type is + /// convertible to LLVM. 
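  /// For example, a default-layout memref such as memref<16x32xf32> passes
  /// this check, while memref<16x32xf32, affine_map<(d0, d1) -> (d1, d0)>>
  /// does not and must be normalized before this lowering can apply.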
+ bool isConvertibleAndHasIdentityMaps(MemRefType type) const; + + /// Returns the type of a pointer to an element of the memref. + Type getElementPtrType(MemRefType type) const; + + /// Computes sizes, strides and buffer size in bytes of `memRefType` with + /// identity layout. Emits constant ops for the static sizes of `memRefType`, + /// and uses `dynamicSizes` for the others. Emits instructions to compute + /// strides and buffer size from these sizes. + /// + /// For example, memref<4x?xf32> emits: + /// `sizes[0]` = llvm.mlir.constant(4 : index) : i64 + /// `sizes[1]` = `dynamicSizes[0]` + /// `strides[1]` = llvm.mlir.constant(1 : index) : i64 + /// `strides[0]` = `sizes[0]` + /// %size = llvm.mul `sizes[0]`, `sizes[1]` : i64 + /// %nullptr = llvm.mlir.null : !llvm.ptr + /// %gep = llvm.getelementptr %nullptr[%size] + /// : (!llvm.ptr, i64) -> !llvm.ptr + /// `sizeBytes` = llvm.ptrtoint %gep : !llvm.ptr to i64 + void getMemRefDescriptorSizes(Location loc, MemRefType memRefType, + ValueRange dynamicSizes, + ConversionPatternRewriter &rewriter, + SmallVectorImpl &sizes, + SmallVectorImpl &strides, + Value &sizeBytes) const; + + /// Computes the size of type in bytes. + Value getSizeInBytes(Location loc, Type type, + ConversionPatternRewriter &rewriter) const; + + /// Computes total number of elements for the given shape. + Value getNumElements(Location loc, ArrayRef shape, + ConversionPatternRewriter &rewriter) const; + + /// Creates and populates a canonical memref descriptor struct. + MemRefDescriptor + createMemRefDescriptor(Location loc, MemRefType memRefType, + Value allocatedPtr, Value alignedPtr, + ArrayRef sizes, ArrayRef strides, + ConversionPatternRewriter &rewriter) const; +}; + +/// Utility class for operation conversions targeting the LLVM dialect that +/// match exactly one source operation. +template +class ConvertOpToLLVMPattern : public ConvertToLLVMPattern { +public: + explicit ConvertOpToLLVMPattern(LLVMTypeConverter &typeConverter, + PatternBenefit benefit = 1) + : ConvertToLLVMPattern(SourceOp::getOperationName(), + &typeConverter.getContext(), typeConverter, + benefit) {} + + /// Wrappers around the RewritePattern methods that pass the derived op type. + void rewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + rewrite(cast(op), operands, rewriter); + } + LogicalResult match(Operation *op) const final { + return match(cast(op)); + } + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + return matchAndRewrite(cast(op), operands, rewriter); + } + + /// Rewrite and Match methods that operate on the SourceOp type. These must be + /// overridden by the derived pattern class. + virtual void rewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + llvm_unreachable("must override rewrite or matchAndRewrite"); + } + virtual LogicalResult match(SourceOp op) const { + llvm_unreachable("must override match or matchAndRewrite"); + } + virtual LogicalResult + matchAndRewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + if (succeeded(match(op))) { + rewrite(op, operands, rewriter); + return success(); + } + return failure(); + } + +private: + using ConvertToLLVMPattern::match; + using ConvertToLLVMPattern::matchAndRewrite; +}; + +/// Generic implementation of one-to-one conversion from "SourceOp" to +/// "TargetOp" where the latter belongs to the LLVM dialect or an equivalent. 
+/// Upholds a convention that multi-result operations get converted into an +/// operation returning the LLVM IR structure type, in which case individual +/// values must be extracted from using LLVM::ExtractValueOp before being used. +template +class OneToOneConvertToLLVMPattern : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using Super = OneToOneConvertToLLVMPattern; + + /// Converts the type of the result to an LLVM type, pass operands as is, + /// preserve attributes. + LogicalResult + matchAndRewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + return LLVM::detail::oneToOneRewrite(op, TargetOp::getOperationName(), + operands, *this->getTypeConverter(), + rewriter); + } +}; + +} // namespace mlir + +#endif // MLIR_CONVERSION_LLVMCOMMON_PATTERN_H diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h new file mode 100644 index 0000000000000..383516ac3cd6d --- /dev/null +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -0,0 +1,85 @@ +//===- VectorPattern.h - Conversion pattern to the LLVM dialect -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_LLVMCOMMON_VECTORPATTERN_H +#define MLIR_CONVERSION_LLVMCOMMON_VECTORPATTERN_H + +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { + +namespace LLVM { +namespace detail { +// Helper struct to "unroll" operations on n-D vectors in terms of operations on +// 1-D LLVM vectors. +struct NDVectorTypeInfo { + // LLVM array struct which encodes n-D vectors. + Type llvmNDVectorTy; + // LLVM vector type which encodes the inner 1-D vector type. + Type llvm1DVectorTy; + // Multiplicity of llvmNDVectorTy to llvm1DVectorTy. + SmallVector arraySizes; +}; + +// For >1-D vector types, extracts the necessary information to iterate over all +// 1-D subvectors in the underlying llrepresentation of the n-D vector +// Iterates on the llvm array type until we hit a non-array type (which is +// asserted to be an llvm vector type). +NDVectorTypeInfo extractNDVectorTypeInfo(VectorType vectorType, + LLVMTypeConverter &converter); + +// Express `linearIndex` in terms of coordinates of `basis`. +// Returns the empty vector when linearIndex is out of the range [0, P] where +// P is the product of all the basis coordinates. +// +// Prerequisites: +// Basis is an array of nonnegative integers (signed type inherited from +// vector shape type). +SmallVector getCoordinates(ArrayRef basis, + unsigned linearIndex); + +// Iterate of linear index, convert to coords space and insert splatted 1-D +// vector in each position. 
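// Worked example (editorial illustration): for vector<2x3x4xf32> the type info
// has arraySizes = {2, 3} and a 1-D type of vector<4xf32>; the iteration
// visits the positions {0,0}, {0,1}, {0,2}, {1,0}, {1,1}, {1,2} in that
// (row-major) order, each identifying one 1-D sub-vector of the aggregate.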
+void nDVectorIterate(const NDVectorTypeInfo &info, OpBuilder &builder, + function_ref fun); + +LogicalResult handleMultidimensionalVectors( + Operation *op, ValueRange operands, LLVMTypeConverter &typeConverter, + std::function createOperand, + ConversionPatternRewriter &rewriter); + +LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp, + ValueRange operands, + LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter); +} // namespace detail +} // namespace LLVM + +/// Basic lowering implementation to rewrite Ops with just one result to the +/// LLVM Dialect. This supports higher-dimensional vector types. +template +class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using Super = VectorConvertToLLVMPattern; + + LogicalResult + matchAndRewrite(SourceOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + static_assert( + std::is_base_of, SourceOp>::value, + "expected single result op"); + return LLVM::detail::vectorOneToOneRewrite( + op, TargetOp::getOperationName(), operands, *this->getTypeConverter(), + rewriter); + } +}; +} // namespace mlir + +#endif // MLIR_CONVERSION_LLVMCOMMON_VECTORPATTERN_H diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h index a76e91ae3d006..604556f3e0a47 100644 --- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -15,167 +15,37 @@ #ifndef MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVM_H #define MLIR_CONVERSION_STANDARDTOLLVM_CONVERTSTANDARDTOLLVM_H -#include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" -#include "mlir/Conversion/LLVMCommon/TypeConverter.h" -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" -#include "mlir/Transforms/DialectConversion.h" - -namespace llvm { -class IntegerType; -class LLVMContext; -class Module; -class Type; -} // namespace llvm +#include "mlir/Conversion/LLVMCommon/Pattern.h" namespace mlir { -class BaseMemRefType; -class ComplexType; -class DataLayoutAnalysis; class LLVMTypeConverter; -class UnrankedMemRefType; - -namespace LLVM { -class LLVMDialect; -class LLVMPointerType; -} // namespace LLVM - -// ------------------ - -/// Base class for operation conversions targeting the LLVM IR dialect. It -/// provides the conversion patterns with access to the LLVMTypeConverter and -/// the LowerToLLVMOptions. The class captures the LLVMTypeConverter and the -/// LowerToLLVMOptions by reference meaning the references have to remain alive -/// during the entire pattern lifetime. -class ConvertToLLVMPattern : public ConversionPattern { -public: - ConvertToLLVMPattern(StringRef rootOpName, MLIRContext *context, - LLVMTypeConverter &typeConverter, - PatternBenefit benefit = 1); - -protected: - /// Returns the LLVM dialect. - LLVM::LLVMDialect &getDialect() const; - - LLVMTypeConverter *getTypeConverter() const; - - /// Gets the MLIR type wrapping the LLVM integer type whose bit width is - /// defined by the used type converter. - Type getIndexType() const; - - /// Gets the MLIR type wrapping the LLVM integer type whose bit width - /// corresponds to that of a LLVM pointer type. - Type getIntPtrType(unsigned addressSpace = 0) const; - - /// Gets the MLIR type wrapping the LLVM void type. - Type getVoidType() const; - - /// Get the MLIR type wrapping the LLVM i8* type. 
- Type getVoidPtrType() const; - - /// Create an LLVM dialect operation defining the given index constant. - Value createIndexConstant(ConversionPatternRewriter &builder, Location loc, - uint64_t value) const; - - // This is a strided getElementPtr variant that linearizes subscripts as: - // `base_offset + index_0 * stride_0 + ... + index_n * stride_n`. - Value getStridedElementPtr(Location loc, MemRefType type, Value memRefDesc, - ValueRange indices, - ConversionPatternRewriter &rewriter) const; - - /// Returns if the given memref has identity maps and the element type is - /// convertible to LLVM. - bool isConvertibleAndHasIdentityMaps(MemRefType type) const; - - /// Returns the type of a pointer to an element of the memref. - Type getElementPtrType(MemRefType type) const; - - /// Computes sizes, strides and buffer size in bytes of `memRefType` with - /// identity layout. Emits constant ops for the static sizes of `memRefType`, - /// and uses `dynamicSizes` for the others. Emits instructions to compute - /// strides and buffer size from these sizes. - /// - /// For example, memref<4x?xf32> emits: - /// `sizes[0]` = llvm.mlir.constant(4 : index) : i64 - /// `sizes[1]` = `dynamicSizes[0]` - /// `strides[1]` = llvm.mlir.constant(1 : index) : i64 - /// `strides[0]` = `sizes[0]` - /// %size = llvm.mul `sizes[0]`, `sizes[1]` : i64 - /// %nullptr = llvm.mlir.null : !llvm.ptr - /// %gep = llvm.getelementptr %nullptr[%size] - /// : (!llvm.ptr, i64) -> !llvm.ptr - /// `sizeBytes` = llvm.ptrtoint %gep : !llvm.ptr to i64 - void getMemRefDescriptorSizes(Location loc, MemRefType memRefType, - ValueRange dynamicSizes, - ConversionPatternRewriter &rewriter, - SmallVectorImpl &sizes, - SmallVectorImpl &strides, - Value &sizeBytes) const; - - /// Computes the size of type in bytes. - Value getSizeInBytes(Location loc, Type type, - ConversionPatternRewriter &rewriter) const; - - /// Computes total number of elements for the given shape. - Value getNumElements(Location loc, ArrayRef shape, - ConversionPatternRewriter &rewriter) const; - - /// Creates and populates a canonical memref descriptor struct. - MemRefDescriptor - createMemRefDescriptor(Location loc, MemRefType memRefType, - Value allocatedPtr, Value alignedPtr, - ArrayRef sizes, ArrayRef strides, - ConversionPatternRewriter &rewriter) const; -}; - -/// Utility class for operation conversions targeting the LLVM dialect that -/// match exactly one source operation. -template -class ConvertOpToLLVMPattern : public ConvertToLLVMPattern { -public: - explicit ConvertOpToLLVMPattern(LLVMTypeConverter &typeConverter, - PatternBenefit benefit = 1) - : ConvertToLLVMPattern(SourceOp::getOperationName(), - &typeConverter.getContext(), typeConverter, - benefit) {} - - /// Wrappers around the RewritePattern methods that pass the derived op type. - void rewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const final { - rewrite(cast(op), operands, rewriter); - } - LogicalResult match(Operation *op) const final { - return match(cast(op)); - } - LogicalResult - matchAndRewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const final { - return matchAndRewrite(cast(op), operands, rewriter); - } - - /// Rewrite and Match methods that operate on the SourceOp type. These must be - /// overridden by the derived pattern class. 
- virtual void rewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { - llvm_unreachable("must override rewrite or matchAndRewrite"); - } - virtual LogicalResult match(SourceOp op) const { - llvm_unreachable("must override match or matchAndRewrite"); - } - virtual LogicalResult - matchAndRewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { - if (succeeded(match(op))) { - rewrite(op, operands, rewriter); - return success(); - } - return failure(); - } - -private: - using ConvertToLLVMPattern::match; - using ConvertToLLVMPattern::matchAndRewrite; -}; +class RewritePatternSet; + +/// Collect a set of patterns to convert memory-related operations from the +/// Standard dialect to the LLVM dialect, excluding non-memory-related +/// operations and FuncOp. +void populateStdToLLVMMemoryConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); + +/// Collect a set of patterns to convert from the Standard dialect to the LLVM +/// dialect, excluding the memory-related operations. +void populateStdToLLVMNonMemoryConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); + +/// Collect the default pattern to convert a FuncOp to the LLVM dialect. If +/// `emitCWrappers` is set, the pattern will also produce functions +/// that pass memref descriptors by pointer-to-structure in addition to the +/// default unpacked form. +void populateStdToLLVMFuncOpConversionPattern(LLVMTypeConverter &converter, + RewritePatternSet &patterns); + +/// Collect the patterns to convert from the Standard dialect to LLVM. The +/// conversion patterns capture the LLVMTypeConverter and the LowerToLLVMOptions +/// by reference meaning the references have to remain alive during the entire +/// pattern lifetime. +void populateStdToLLVMConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns); /// Lowering for AllocOp and AllocaOp. struct AllocLikeOpLLVMLowering : public ConvertToLLVMPattern { @@ -226,64 +96,6 @@ struct AllocLikeOpLLVMLowering : public ConvertToLLVMPattern { ConversionPatternRewriter &rewriter) const override; }; -namespace LLVM { -namespace detail { -/// Replaces the given operation "op" with a new operation of type "targetOp" -/// and given operands. -LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter); - -LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter); -} // namespace detail -} // namespace LLVM - -/// Generic implementation of one-to-one conversion from "SourceOp" to -/// "TargetOp" where the latter belongs to the LLVM dialect or an equivalent. -/// Upholds a convention that multi-result operations get converted into an -/// operation returning the LLVM IR structure type, in which case individual -/// values must be extracted from using LLVM::ExtractValueOp before being used. -template -class OneToOneConvertToLLVMPattern : public ConvertOpToLLVMPattern { -public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - using Super = OneToOneConvertToLLVMPattern; - - /// Converts the type of the result to an LLVM type, pass operands as is, - /// preserve attributes. 
- LogicalResult - matchAndRewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - return LLVM::detail::oneToOneRewrite(op, TargetOp::getOperationName(), - operands, *this->getTypeConverter(), - rewriter); - } -}; - -/// Basic lowering implementation to rewrite Ops with just one result to the -/// LLVM Dialect. This supports higher-dimensional vector types. -template -class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern { -public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - using Super = VectorConvertToLLVMPattern; - - LogicalResult - matchAndRewrite(SourceOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - static_assert( - std::is_base_of, SourceOp>::value, - "expected single result op"); - return LLVM::detail::vectorOneToOneRewrite( - op, TargetOp::getOperationName(), operands, *this->getTypeConverter(), - rewriter); - } -}; - /// Derived class that automatically populates legalization information for /// different LLVM ops. class LLVMConversionTarget : public ConversionTarget { diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h index 560794a0a925f..6d809d97234e1 100644 --- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h @@ -12,38 +12,10 @@ #include namespace mlir { -class LLVMTypeConverter; class LowerToLLVMOptions; class ModuleOp; template class OperationPass; -class RewritePatternSet; -using OwningRewritePatternList = RewritePatternSet; - -/// Collect a set of patterns to convert memory-related operations from the -/// Standard dialect to the LLVM dialect, excluding non-memory-related -/// operations and FuncOp. -void populateStdToLLVMMemoryConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); - -/// Collect a set of patterns to convert from the Standard dialect to the LLVM -/// dialect, excluding the memory-related operations. -void populateStdToLLVMNonMemoryConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); - -/// Collect the default pattern to convert a FuncOp to the LLVM dialect. If -/// `emitCWrappers` is set, the pattern will also produce functions -/// that pass memref descriptors by pointer-to-structure in addition to the -/// default unpacked form. -void populateStdToLLVMFuncOpConversionPattern(LLVMTypeConverter &converter, - RewritePatternSet &patterns); - -/// Collect the patterns to convert from the Standard dialect to LLVM. The -/// conversion patterns capture the LLVMTypeConverter and the LowerToLLVMOptions -/// by reference meaning the references have to remain alive during the entire -/// pattern lifetime. -void populateStdToLLVMConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); /// Creates a pass to convert the Standard dialect into the LLVMIR dialect. 
/// stdlib malloc/free is used by default for allocating memrefs allocated with diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td index 092d22983d3f2..49ececc0790aa 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td @@ -33,8 +33,8 @@ def Linalg_Dialect : Dialect { }]; let cppNamespace = "::mlir::linalg"; let dependentDialects = [ - "AffineDialect", "memref::MemRefDialect", "StandardOpsDialect", - "tensor::TensorDialect" + "AffineDialect", "math::MathDialect", "memref::MemRefDialect", + "StandardOpsDialect", "tensor::TensorDialect" ]; let hasCanonicalizer = 1; let hasOperationAttrVerify = 1; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 1e4277ecd7bdf..04f9776005c4e 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -887,3 +887,58 @@ structured_op: !LinalgStructuredOpConfig scalar_const: '2.3283063999999999E-10 : f64' - !ScalarExpression scalar_arg: min +--- !LinalgOpConfig +metadata: !LinalgOpMetadata + name: soft_plus_2d + cpp_class_name: SoftPlus2DOp + doc: |- + Implements the soft plus operator. + + Numeric casting is performed on the input operand, promoting it to the same + data type as the accumulator/output. +structured_op: !LinalgStructuredOpConfig + args: + - !LinalgOperandDefConfig + name: I + usage: InputOperand + type_var: T + shape_map: affine_map<()[s0, s1] -> (s0, s1)> + - !LinalgOperandDefConfig + name: O + usage: OutputOperand + type_var: U + shape_map: affine_map<()[s0, s1] -> (s0, s1)> + indexing_maps: !LinalgIndexingMapsConfig + static_indexing_maps: + - affine_map<(d0, d1)[s0, s1] -> (d0, d1)> + - affine_map<(d0, d1)[s0, s1] -> (d0, d1)> + iterator_types: + - parallel + - parallel + assignments: + - !ScalarAssign + arg: O + value: !ScalarExpression + scalar_apply: + fn_name: log + operands: + - !ScalarExpression + scalar_apply: + fn_name: add + operands: + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_const: '1.000000e+00 : f64' + - !ScalarExpression + scalar_apply: + fn_name: exp + operands: + - !ScalarExpression + symbolic_cast: + type_var: U + operands: + - !ScalarExpression + scalar_arg: I diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 18f5beeddf2ea..fa17237216596 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -503,41 +503,10 @@ def PoolingSumOp: SingleInputPoolingBase_Op<"pooling_sum"> { // Generic Linalg ops. 
//===----------------------------------------------------------------------===// -class GenericOpBase : LinalgStructuredBase_Op, SingleBlockImplicitTerminator<"YieldOp">]> { - let arguments = (ins Variadic:$inputs, - Variadic:$outputs, - AffineMapArrayAttr:$indexing_maps, - ArrayAttr:$iterator_types, - OptionalAttr:$doc, - OptionalAttr:$library_call); - let results = (outs Variadic:$result_tensors); - let regions = (region AnyRegion:$region); - let extraClassDeclaration = structuredOpsBaseDecls # [{ - SmallVector linalgTraitAttrNames() { - return SmallVector{ - getDocAttrName(), - getIndexingMapsAttrName(), getLibraryCallAttrName(), - getIteratorTypesAttrName(), - }; - } - std::string getLibraryCallName() { - return library_call().hasValue() ? - library_call()->str() : "op_has_no_registered_library_name"; - } - - static std::function - getRegionBuilder() { - return nullptr; - } - }]; - let printer = [{ return ::print(p, *this); }]; - let parser = [{ return ::parseGenericOp(parser, result); }]; -} - -def GenericOp : GenericOpBase<"generic"> { let description = [{ Generic Linalg op form where the key properties of the computation are specified as attributes. In pretty form, a `linalg.generic` op is written @@ -636,6 +605,15 @@ def GenericOp : GenericOpBase<"generic"> { ``` }]; + let arguments = (ins Variadic:$inputs, + Variadic:$outputs, + AffineMapArrayAttr:$indexing_maps, + ArrayAttr:$iterator_types, + OptionalAttr:$doc, + OptionalAttr:$library_call); + let results = (outs Variadic:$result_tensors); + let regions = (region AnyRegion:$region); + let builders = [ OpBuilder<(ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, "ValueRange":$outputs, "ArrayRef":$indexingMaps, @@ -654,6 +632,29 @@ def GenericOp : GenericOpBase<"generic"> { "ArrayRef":$indexingMaps, "ArrayRef":$iteratorTypes, CArg<"function_ref", "nullptr">)> ]; + + let extraClassDeclaration = structuredOpsBaseDecls # [{ + SmallVector linalgTraitAttrNames() { + return SmallVector{ + getDocAttrName(), + getIndexingMapsAttrName(), getLibraryCallAttrName(), + getIteratorTypesAttrName(), + }; + } + std::string getLibraryCallName() { + return library_call().hasValue() ? 
+ library_call()->str() : "op_has_no_registered_library_name"; + } + + static std::function + getRegionBuilder() { + return nullptr; + } + }]; + + let printer = [{ return ::print(p, *this); }]; + let parser = [{ return ::parseGenericOp(parser, result); }]; + let verifier = [{ return ::verify(*this); }]; let hasFolder = 1; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h index c5cfdd15c00a8..f5913e6ad6164 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgTypes.h @@ -10,6 +10,7 @@ #define MLIR_DIALECT_LINALG_LINALGTYPES_H_ #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt index 71694bf925188..988071e6a00de 100644 --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -32,6 +32,7 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms MLIRAsyncToLLVM MLIRGPUTransforms MLIRIR + MLIRLLVMCommonConversion MLIRLLVMIR MLIRPass MLIRSupport diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index dcb28f462d570..557eabcad79e9 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -17,7 +17,9 @@ #include "../PassDetail.h" #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/Async/IR/Async.h" #include "mlir/Dialect/GPU/GPUDialect.h" diff --git a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt index 14c7c94b983f9..3657e56e61b17 100644 --- a/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/LLVMCommon/CMakeLists.txt @@ -1,8 +1,10 @@ add_mlir_conversion_library(MLIRLLVMCommonConversion LoweringOptions.cpp MemRefBuilder.cpp + Pattern.cpp StructBuilder.cpp TypeConverter.cpp + VectorPattern.cpp LINK_COMPONENTS Core diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp new file mode 100644 index 0000000000000..6e5a827b34e8d --- /dev/null +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -0,0 +1,269 @@ +//===- Pattern.cpp - Conversion pattern to the LLVM dialect ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/AffineMap.h" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ConvertToLLVMPattern +//===----------------------------------------------------------------------===// + +ConvertToLLVMPattern::ConvertToLLVMPattern(StringRef rootOpName, + MLIRContext *context, + LLVMTypeConverter &typeConverter, + PatternBenefit benefit) + : ConversionPattern(typeConverter, rootOpName, benefit, context) {} + +LLVMTypeConverter *ConvertToLLVMPattern::getTypeConverter() const { + return static_cast( + ConversionPattern::getTypeConverter()); +} + +LLVM::LLVMDialect &ConvertToLLVMPattern::getDialect() const { + return *getTypeConverter()->getDialect(); +} + +Type ConvertToLLVMPattern::getIndexType() const { + return getTypeConverter()->getIndexType(); +} + +Type ConvertToLLVMPattern::getIntPtrType(unsigned addressSpace) const { + return IntegerType::get(&getTypeConverter()->getContext(), + getTypeConverter()->getPointerBitwidth(addressSpace)); +} + +Type ConvertToLLVMPattern::getVoidType() const { + return LLVM::LLVMVoidType::get(&getTypeConverter()->getContext()); +} + +Type ConvertToLLVMPattern::getVoidPtrType() const { + return LLVM::LLVMPointerType::get( + IntegerType::get(&getTypeConverter()->getContext(), 8)); +} + +Value ConvertToLLVMPattern::createIndexAttrConstant(OpBuilder &builder, + Location loc, + Type resultType, + int64_t value) { + return builder.create( + loc, resultType, builder.getIntegerAttr(builder.getIndexType(), value)); +} + +Value ConvertToLLVMPattern::createIndexConstant( + ConversionPatternRewriter &builder, Location loc, uint64_t value) const { + return createIndexAttrConstant(builder, loc, getIndexType(), value); +} + +Value ConvertToLLVMPattern::getStridedElementPtr( + Location loc, MemRefType type, Value memRefDesc, ValueRange indices, + ConversionPatternRewriter &rewriter) const { + + int64_t offset; + SmallVector strides; + auto successStrides = getStridesAndOffset(type, strides, offset); + assert(succeeded(successStrides) && "unexpected non-strided memref"); + (void)successStrides; + + MemRefDescriptor memRefDescriptor(memRefDesc); + Value base = memRefDescriptor.alignedPtr(rewriter, loc); + + Value index; + if (offset != 0) // Skip if offset is zero. + index = MemRefType::isDynamicStrideOrOffset(offset) + ? memRefDescriptor.offset(rewriter, loc) + : createIndexConstant(rewriter, loc, offset); + + for (int i = 0, e = indices.size(); i < e; ++i) { + Value increment = indices[i]; + if (strides[i] != 1) { // Skip if stride is 1. + Value stride = MemRefType::isDynamicStrideOrOffset(strides[i]) + ? memRefDescriptor.stride(rewriter, loc, i) + : createIndexConstant(rewriter, loc, strides[i]); + increment = rewriter.create(loc, increment, stride); + } + index = + index ? rewriter.create(loc, index, increment) : increment; + } + + Type elementPtrType = memRefDescriptor.getElementPtrType(); + return index ? rewriter.create(loc, elementPtrType, base, index) + : base; +} + +// Check if the MemRefType `type` is supported by the lowering. We currently +// only support memrefs with identity maps. 
+bool ConvertToLLVMPattern::isConvertibleAndHasIdentityMaps(
+    MemRefType type) const {
+  if (!typeConverter->convertType(type.getElementType()))
+    return false;
+  return type.getAffineMaps().empty() ||
+         llvm::all_of(type.getAffineMaps(),
+                      [](AffineMap map) { return map.isIdentity(); });
+}
+
+Type ConvertToLLVMPattern::getElementPtrType(MemRefType type) const {
+  auto elementType = type.getElementType();
+  auto structElementType = typeConverter->convertType(elementType);
+  return LLVM::LLVMPointerType::get(structElementType,
+                                    type.getMemorySpaceAsInt());
+}
+
+void ConvertToLLVMPattern::getMemRefDescriptorSizes(
+    Location loc, MemRefType memRefType, ValueRange dynamicSizes,
+    ConversionPatternRewriter &rewriter, SmallVectorImpl<Value> &sizes,
+    SmallVectorImpl<Value> &strides, Value &sizeBytes) const {
+  assert(isConvertibleAndHasIdentityMaps(memRefType) &&
+         "layout maps must have been normalized away");
+  assert(count(memRefType.getShape(), ShapedType::kDynamicSize) ==
+             static_cast<ssize_t>(dynamicSizes.size()) &&
+         "dynamicSizes size doesn't match dynamic sizes count in memref shape");
+
+  sizes.reserve(memRefType.getRank());
+  unsigned dynamicIndex = 0;
+  for (int64_t size : memRefType.getShape()) {
+    sizes.push_back(size == ShapedType::kDynamicSize
+                        ? dynamicSizes[dynamicIndex++]
+                        : createIndexConstant(rewriter, loc, size));
+  }
+
+  // Strides: iterate sizes in reverse order and multiply.
+  int64_t stride = 1;
+  Value runningStride = createIndexConstant(rewriter, loc, 1);
+  strides.resize(memRefType.getRank());
+  for (auto i = memRefType.getRank(); i-- > 0;) {
+    strides[i] = runningStride;
+
+    int64_t size = memRefType.getShape()[i];
+    if (size == 0)
+      continue;
+    bool useSizeAsStride = stride == 1;
+    if (size == ShapedType::kDynamicSize)
+      stride = ShapedType::kDynamicSize;
+    if (stride != ShapedType::kDynamicSize)
+      stride *= size;
+
+    if (useSizeAsStride)
+      runningStride = sizes[i];
+    else if (stride == ShapedType::kDynamicSize)
+      runningStride =
+          rewriter.create<LLVM::MulOp>(loc, runningStride, sizes[i]);
+    else
+      runningStride = createIndexConstant(rewriter, loc, stride);
+  }
+
+  // Buffer size in bytes.
+  Type elementPtrType = getElementPtrType(memRefType);
+  Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
+  Value gepPtr = rewriter.create<LLVM::GEPOp>(
+      loc, elementPtrType, ArrayRef<Value>{nullPtr, runningStride});
+  sizeBytes = rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);
+}
+
+Value ConvertToLLVMPattern::getSizeInBytes(
+    Location loc, Type type, ConversionPatternRewriter &rewriter) const {
+  // Compute the size of an individual element. This emits the MLIR equivalent
+  // of the following sizeof(...) implementation in LLVM IR:
+  //   %0 = getelementptr %elementType* null, %indexType 1
+  //   %1 = ptrtoint %elementType* %0 to %indexType
+  // which is a common pattern of getting the size of a type in bytes.
+  auto convertedPtrType =
+      LLVM::LLVMPointerType::get(typeConverter->convertType(type));
+  auto nullPtr = rewriter.create<LLVM::NullOp>(loc, convertedPtrType);
+  auto gep = rewriter.create<LLVM::GEPOp>(
+      loc, convertedPtrType,
+      ArrayRef<Value>{nullPtr, createIndexConstant(rewriter, loc, 1)});
+  return rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gep);
+}
+
+Value ConvertToLLVMPattern::getNumElements(
+    Location loc, ArrayRef<Value> shape,
+    ConversionPatternRewriter &rewriter) const {
+  // Compute the total number of memref elements.
+  Value numElements =
+      shape.empty() ? createIndexConstant(rewriter, loc, 1) : shape.front();
+  for (unsigned i = 1, e = shape.size(); i < e; ++i)
+    numElements = rewriter.create<LLVM::MulOp>(loc, numElements, shape[i]);
+  return numElements;
+}
+
+/// Creates and populates the memref descriptor struct given all its fields.
+MemRefDescriptor ConvertToLLVMPattern::createMemRefDescriptor(
+    Location loc, MemRefType memRefType, Value allocatedPtr, Value alignedPtr,
+    ArrayRef<Value> sizes, ArrayRef<Value> strides,
+    ConversionPatternRewriter &rewriter) const {
+  auto structType = typeConverter->convertType(memRefType);
+  auto memRefDescriptor = MemRefDescriptor::undef(rewriter, loc, structType);
+
+  // Field 1: Allocated pointer, used for malloc/free.
+  memRefDescriptor.setAllocatedPtr(rewriter, loc, allocatedPtr);
+
+  // Field 2: Actual aligned pointer to payload.
+  memRefDescriptor.setAlignedPtr(rewriter, loc, alignedPtr);
+
+  // Field 3: Offset in aligned pointer.
+  memRefDescriptor.setOffset(rewriter, loc,
+                             createIndexConstant(rewriter, loc, 0));
+
+  // Field 4: Sizes.
+  for (auto en : llvm::enumerate(sizes))
+    memRefDescriptor.setSize(rewriter, loc, en.index(), en.value());
+
+  // Field 5: Strides.
+  for (auto en : llvm::enumerate(strides))
+    memRefDescriptor.setStride(rewriter, loc, en.index(), en.value());
+
+  return memRefDescriptor;
+}
+
+//===----------------------------------------------------------------------===//
+// Detail methods
+//===----------------------------------------------------------------------===//
+
+/// Replaces the given operation "op" with a new operation of type "targetOp"
+/// and given operands.
+LogicalResult LLVM::detail::oneToOneRewrite(
+    Operation *op, StringRef targetOp, ValueRange operands,
+    LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) {
+  unsigned numResults = op->getNumResults();
+
+  Type packedType;
+  if (numResults != 0) {
+    packedType = typeConverter.packFunctionResults(op->getResultTypes());
+    if (!packedType)
+      return failure();
+  }
+
+  // Create the operation through state since we don't know its C++ type.
+  OperationState state(op->getLoc(), targetOp);
+  state.addTypes(packedType);
+  state.addOperands(operands);
+  state.addAttributes(op->getAttrs());
+  Operation *newOp = rewriter.createOperation(state);
+
+  // If the operation produced 0 or 1 result, return them immediately.
+  if (numResults == 0)
+    return rewriter.eraseOp(op), success();
+  if (numResults == 1)
+    return rewriter.replaceOp(op, newOp->getResult(0)), success();
+
+  // Otherwise, it had been converted to an operation producing a structure.
+  // Extract individual results from the structure and return them as list.
+  SmallVector<Value, 4> results;
+  results.reserve(numResults);
+  for (unsigned i = 0; i < numResults; ++i) {
+    auto type = typeConverter.convertType(op->getResult(i).getType());
+    results.push_back(rewriter.create<LLVM::ExtractValueOp>(
+        op->getLoc(), type, newOp->getResult(0), rewriter.getI64ArrayAttr(i)));
+  }
+  rewriter.replaceOp(op, results);
+  return success();
+}
diff --git a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp
new file mode 100644
index 0000000000000..ace5bec09f4e7
--- /dev/null
+++ b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp
@@ -0,0 +1,142 @@
+//===- VectorPattern.cpp - Vector conversion pattern to the LLVM dialect --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+
+using namespace mlir;
+
+// For >1-D vector types, extracts the necessary information to iterate over
+// all 1-D subvectors in the underlying LLVM representation of the n-D vector.
+// Iterates on the LLVM array type until we hit a non-array type (which is
+// asserted to be an LLVM vector type).
+LLVM::detail::NDVectorTypeInfo
+LLVM::detail::extractNDVectorTypeInfo(VectorType vectorType,
+                                      LLVMTypeConverter &converter) {
+  assert(vectorType.getRank() > 1 && "expected >1D vector type");
+  NDVectorTypeInfo info;
+  info.llvmNDVectorTy = converter.convertType(vectorType);
+  if (!info.llvmNDVectorTy || !LLVM::isCompatibleType(info.llvmNDVectorTy)) {
+    info.llvmNDVectorTy = nullptr;
+    return info;
+  }
+  info.arraySizes.reserve(vectorType.getRank() - 1);
+  auto llvmTy = info.llvmNDVectorTy;
+  while (llvmTy.isa<LLVM::LLVMArrayType>()) {
+    info.arraySizes.push_back(
+        llvmTy.cast<LLVM::LLVMArrayType>().getNumElements());
+    llvmTy = llvmTy.cast<LLVM::LLVMArrayType>().getElementType();
+  }
+  if (!LLVM::isCompatibleVectorType(llvmTy))
+    return info;
+  info.llvm1DVectorTy = llvmTy;
+  return info;
+}
+
+// Express `linearIndex` in terms of coordinates of `basis`.
+// Returns the empty vector when linearIndex is out of the range [0, P) where
+// P is the product of all the basis coordinates.
+//
+// Prerequisites:
+//   Basis is an array of nonnegative integers (signed type inherited from
+//   vector shape type).
+SmallVector<int64_t, 4> LLVM::detail::getCoordinates(ArrayRef<int64_t> basis,
+                                                     unsigned linearIndex) {
+  SmallVector<int64_t, 4> res;
+  res.reserve(basis.size());
+  for (unsigned basisElement : llvm::reverse(basis)) {
+    res.push_back(linearIndex % basisElement);
+    linearIndex = linearIndex / basisElement;
+  }
+  if (linearIndex > 0)
+    return {};
+  std::reverse(res.begin(), res.end());
+  return res;
+}
+
+// Iterate over the linear index, convert to coords space and insert splatted
+// 1-D vector in each position.
+void LLVM::detail::nDVectorIterate(const LLVM::detail::NDVectorTypeInfo &info,
+                                   OpBuilder &builder,
+                                   function_ref<void(ArrayAttr)> fun) {
+  unsigned ub = 1;
+  for (auto s : info.arraySizes)
+    ub *= s;
+  for (unsigned linearIndex = 0; linearIndex < ub; ++linearIndex) {
+    auto coords = getCoordinates(info.arraySizes, linearIndex);
+    // Linear index is out of bounds, we are done.
+    if (coords.empty())
+      break;
+    assert(coords.size() == info.arraySizes.size());
+    auto position = builder.getI64ArrayAttr(coords);
+    fun(position);
+  }
+}
+
+LogicalResult LLVM::detail::handleMultidimensionalVectors(
+    Operation *op, ValueRange operands, LLVMTypeConverter &typeConverter,
+    std::function<Value(Type, ValueRange)> createOperand,
+    ConversionPatternRewriter &rewriter) {
+  auto resultNDVectorType = op->getResult(0).getType().cast<VectorType>();
+
+  SmallVector<Type> operand1DVectorTypes;
+  for (Value operand : op->getOperands()) {
+    auto operandNDVectorType = operand.getType().cast<VectorType>();
+    auto operandTypeInfo =
+        extractNDVectorTypeInfo(operandNDVectorType, typeConverter);
+    operand1DVectorTypes.push_back(operandTypeInfo.llvm1DVectorTy);
+  }
+  auto resultTypeInfo =
+      extractNDVectorTypeInfo(resultNDVectorType, typeConverter);
+  auto result1DVectorTy = resultTypeInfo.llvm1DVectorTy;
+  auto resultNDVectoryTy = resultTypeInfo.llvmNDVectorTy;
+  auto loc = op->getLoc();
+  Value desc = rewriter.create<LLVM::UndefOp>(loc, resultNDVectoryTy);
+  nDVectorIterate(resultTypeInfo, rewriter, [&](ArrayAttr position) {
+    // For this unrolled `position` corresponding to the `linearIndex`^th
+    // element, extract operand vectors.
+    SmallVector<Value, 4> extractedOperands;
+    for (auto operand : llvm::enumerate(operands)) {
+      extractedOperands.push_back(rewriter.create<LLVM::ExtractValueOp>(
+          loc, operand1DVectorTypes[operand.index()], operand.value(),
+          position));
+    }
+    Value newVal = createOperand(result1DVectorTy, extractedOperands);
+    desc = rewriter.create<LLVM::InsertValueOp>(loc, resultNDVectoryTy, desc,
+                                                newVal, position);
+  });
+  rewriter.replaceOp(op, desc);
+  return success();
+}
+
+LogicalResult LLVM::detail::vectorOneToOneRewrite(
+    Operation *op, StringRef targetOp, ValueRange operands,
+    LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) {
+  assert(!operands.empty());
+
+  // Cannot convert ops if their operands are not of LLVM type.
+ if (!llvm::all_of(operands.getTypes(), + [](Type t) { return isCompatibleType(t); })) + return failure(); + + auto llvmNDVectorTy = operands[0].getType(); + if (!llvmNDVectorTy.isa()) + return oneToOneRewrite(op, targetOp, operands, typeConverter, rewriter); + + auto callback = [op, targetOp, &rewriter](Type llvm1DVectorTy, + ValueRange operands) { + OperationState state(op->getLoc(), targetOp); + state.addTypes(llvm1DVectorTy); + state.addOperands(operands); + state.addAttributes(op->getAttrs()); + return rewriter.createOperation(state)->getResult(0); + }; + + return handleMultidimensionalVectors(op, operands, typeConverter, callback, + rewriter); +} diff --git a/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt b/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt index c9cf7883a0abf..e0774746960f9 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/OpenMPToLLVM/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_conversion_library(MLIROpenMPToLLVM LINK_LIBS PUBLIC MLIRIR + MLIRLLVMCommonConversion MLIRLLVMIR MLIROpenMP MLIRStandardToLLVM diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index 878e11ae6c5aa..3a9eff6ead6d3 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -9,7 +9,9 @@ #include "mlir/Conversion/OpenMPToLLVM/ConvertOpenMPToLLVM.h" #include "../PassDetail.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp index f094f6443b156..4ec3c70568765 100644 --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -13,6 +13,8 @@ #include "../PassDetail.h" #include "mlir/Analysis/DataLayoutAnalysis.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Conversion/LLVMCommon/VectorPattern.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" #include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" @@ -46,214 +48,6 @@ using namespace mlir; #define PASS_NAME "convert-std-to-llvm" -ConvertToLLVMPattern::ConvertToLLVMPattern(StringRef rootOpName, - MLIRContext *context, - LLVMTypeConverter &typeConverter, - PatternBenefit benefit) - : ConversionPattern(typeConverter, rootOpName, benefit, context) {} - - -LLVMTypeConverter *ConvertToLLVMPattern::getTypeConverter() const { - return static_cast( - ConversionPattern::getTypeConverter()); -} - -LLVM::LLVMDialect &ConvertToLLVMPattern::getDialect() const { - return *getTypeConverter()->getDialect(); -} - -Type ConvertToLLVMPattern::getIndexType() const { - return getTypeConverter()->getIndexType(); -} - -Type ConvertToLLVMPattern::getIntPtrType(unsigned addressSpace) const { - return IntegerType::get(&getTypeConverter()->getContext(), - getTypeConverter()->getPointerBitwidth(addressSpace)); -} - -Type ConvertToLLVMPattern::getVoidType() const { - return LLVM::LLVMVoidType::get(&getTypeConverter()->getContext()); -} - -Type ConvertToLLVMPattern::getVoidPtrType() const { - return LLVM::LLVMPointerType::get( - IntegerType::get(&getTypeConverter()->getContext(), 8)); -} - -// Creates a constant 
Op producing a value of `resultType` from an index-typed -// integer attribute. -static Value createIndexAttrConstant(OpBuilder &builder, Location loc, - Type resultType, int64_t value) { - return builder.create( - loc, resultType, builder.getIntegerAttr(builder.getIndexType(), value)); -} - -Value ConvertToLLVMPattern::createIndexConstant( - ConversionPatternRewriter &builder, Location loc, uint64_t value) const { - return createIndexAttrConstant(builder, loc, getIndexType(), value); -} - -Value ConvertToLLVMPattern::getStridedElementPtr( - Location loc, MemRefType type, Value memRefDesc, ValueRange indices, - ConversionPatternRewriter &rewriter) const { - - int64_t offset; - SmallVector strides; - auto successStrides = getStridesAndOffset(type, strides, offset); - assert(succeeded(successStrides) && "unexpected non-strided memref"); - (void)successStrides; - - MemRefDescriptor memRefDescriptor(memRefDesc); - Value base = memRefDescriptor.alignedPtr(rewriter, loc); - - Value index; - if (offset != 0) // Skip if offset is zero. - index = MemRefType::isDynamicStrideOrOffset(offset) - ? memRefDescriptor.offset(rewriter, loc) - : createIndexConstant(rewriter, loc, offset); - - for (int i = 0, e = indices.size(); i < e; ++i) { - Value increment = indices[i]; - if (strides[i] != 1) { // Skip if stride is 1. - Value stride = MemRefType::isDynamicStrideOrOffset(strides[i]) - ? memRefDescriptor.stride(rewriter, loc, i) - : createIndexConstant(rewriter, loc, strides[i]); - increment = rewriter.create(loc, increment, stride); - } - index = - index ? rewriter.create(loc, index, increment) : increment; - } - - Type elementPtrType = memRefDescriptor.getElementPtrType(); - return index ? rewriter.create(loc, elementPtrType, base, index) - : base; -} - -// Check if the MemRefType `type` is supported by the lowering. We currently -// only support memrefs with identity maps. -bool ConvertToLLVMPattern::isConvertibleAndHasIdentityMaps( - MemRefType type) const { - if (!typeConverter->convertType(type.getElementType())) - return false; - return type.getAffineMaps().empty() || - llvm::all_of(type.getAffineMaps(), - [](AffineMap map) { return map.isIdentity(); }); -} - -Type ConvertToLLVMPattern::getElementPtrType(MemRefType type) const { - auto elementType = type.getElementType(); - auto structElementType = typeConverter->convertType(elementType); - return LLVM::LLVMPointerType::get(structElementType, - type.getMemorySpaceAsInt()); -} - -void ConvertToLLVMPattern::getMemRefDescriptorSizes( - Location loc, MemRefType memRefType, ValueRange dynamicSizes, - ConversionPatternRewriter &rewriter, SmallVectorImpl &sizes, - SmallVectorImpl &strides, Value &sizeBytes) const { - assert(isConvertibleAndHasIdentityMaps(memRefType) && - "layout maps must have been normalized away"); - assert(count(memRefType.getShape(), ShapedType::kDynamicSize) == - static_cast(dynamicSizes.size()) && - "dynamicSizes size doesn't match dynamic sizes count in memref shape"); - - sizes.reserve(memRefType.getRank()); - unsigned dynamicIndex = 0; - for (int64_t size : memRefType.getShape()) { - sizes.push_back(size == ShapedType::kDynamicSize - ? dynamicSizes[dynamicIndex++] - : createIndexConstant(rewriter, loc, size)); - } - - // Strides: iterate sizes in reverse order and multiply. 
- int64_t stride = 1; - Value runningStride = createIndexConstant(rewriter, loc, 1); - strides.resize(memRefType.getRank()); - for (auto i = memRefType.getRank(); i-- > 0;) { - strides[i] = runningStride; - - int64_t size = memRefType.getShape()[i]; - if (size == 0) - continue; - bool useSizeAsStride = stride == 1; - if (size == ShapedType::kDynamicSize) - stride = ShapedType::kDynamicSize; - if (stride != ShapedType::kDynamicSize) - stride *= size; - - if (useSizeAsStride) - runningStride = sizes[i]; - else if (stride == ShapedType::kDynamicSize) - runningStride = - rewriter.create(loc, runningStride, sizes[i]); - else - runningStride = createIndexConstant(rewriter, loc, stride); - } - - // Buffer size in bytes. - Type elementPtrType = getElementPtrType(memRefType); - Value nullPtr = rewriter.create(loc, elementPtrType); - Value gepPtr = rewriter.create( - loc, elementPtrType, ArrayRef{nullPtr, runningStride}); - sizeBytes = rewriter.create(loc, getIndexType(), gepPtr); -} - -Value ConvertToLLVMPattern::getSizeInBytes( - Location loc, Type type, ConversionPatternRewriter &rewriter) const { - // Compute the size of an individual element. This emits the MLIR equivalent - // of the following sizeof(...) implementation in LLVM IR: - // %0 = getelementptr %elementType* null, %indexType 1 - // %1 = ptrtoint %elementType* %0 to %indexType - // which is a common pattern of getting the size of a type in bytes. - auto convertedPtrType = - LLVM::LLVMPointerType::get(typeConverter->convertType(type)); - auto nullPtr = rewriter.create(loc, convertedPtrType); - auto gep = rewriter.create( - loc, convertedPtrType, - ArrayRef{nullPtr, createIndexConstant(rewriter, loc, 1)}); - return rewriter.create(loc, getIndexType(), gep); -} - -Value ConvertToLLVMPattern::getNumElements( - Location loc, ArrayRef shape, - ConversionPatternRewriter &rewriter) const { - // Compute the total number of memref elements. - Value numElements = - shape.empty() ? createIndexConstant(rewriter, loc, 1) : shape.front(); - for (unsigned i = 1, e = shape.size(); i < e; ++i) - numElements = rewriter.create(loc, numElements, shape[i]); - return numElements; -} - -/// Creates and populates the memref descriptor struct given all its fields. -MemRefDescriptor ConvertToLLVMPattern::createMemRefDescriptor( - Location loc, MemRefType memRefType, Value allocatedPtr, Value alignedPtr, - ArrayRef sizes, ArrayRef strides, - ConversionPatternRewriter &rewriter) const { - auto structType = typeConverter->convertType(memRefType); - auto memRefDescriptor = MemRefDescriptor::undef(rewriter, loc, structType); - - // Field 1: Allocated pointer, used for malloc/free. - memRefDescriptor.setAllocatedPtr(rewriter, loc, allocatedPtr); - - // Field 2: Actual aligned pointer to payload. - memRefDescriptor.setAlignedPtr(rewriter, loc, alignedPtr); - - // Field 3: Offset in aligned pointer. - memRefDescriptor.setOffset(rewriter, loc, - createIndexConstant(rewriter, loc, 0)); - - // Fields 4: Sizes. - for (auto en : llvm::enumerate(sizes)) - memRefDescriptor.setSize(rewriter, loc, en.index(), en.value()); - - // Field 5: Strides. - for (auto en : llvm::enumerate(strides)) - memRefDescriptor.setStride(rewriter, loc, en.index(), en.value()); - - return memRefDescriptor; -} - /// Only retain those attributes that are not constructed by /// `LLVMFuncOp::build`. If `filterArgAttrs` is set, also filter out argument /// attributes. 
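The helpers deleted in this hunk are the same getMemRefDescriptorSizes, getSizeInBytes, getNumElements and createMemRefDescriptor that the patch reintroduces in LLVMCommon/Pattern.cpp above; only their home changes. For a statically shaped memref, the stride logic they emit is a reverse suffix product over the sizes. A minimal standalone sketch of that static case, in plain C++ with no MLIR dependency and an assumed example shape:

```cpp
// Standalone illustration (not MLIR code) of the row-major stride logic that
// getMemRefDescriptorSizes emits for a static shape: strides are suffix
// products of the sizes, and the innermost stride is 1.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Assumed example shape; a memref<2x3x4xf32> would use sizes {2, 3, 4}.
  std::vector<int64_t> sizes = {2, 3, 4};
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (int i = static_cast<int>(sizes.size()) - 1; i >= 0; --i) {
    strides[i] = running; // stride of dim i, before multiplying in its size
    running *= sizes[i];
  }
  // Prints: strides = 12 4 1, total elements = 24.
  std::cout << "strides =";
  for (int64_t s : strides)
    std::cout << ' ' << s;
  std::cout << ", total elements = " << running << '\n';
  return 0;
}
```

In the real helper, any dynamic dimension makes the running stride non-constant, so it is materialized with LLVM::MulOp on the SSA size values rather than folded into a compile-time constant.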
@@ -572,190 +366,6 @@ struct BarePtrFuncOpConversion : public FuncOpConversionBase { } }; -//////////////// Support for Lowering operations on n-D vectors //////////////// -// Helper struct to "unroll" operations on n-D vectors in terms of operations on -// 1-D LLVM vectors. -struct NDVectorTypeInfo { - // LLVM array struct which encodes n-D vectors. - Type llvmNDVectorTy; - // LLVM vector type which encodes the inner 1-D vector type. - Type llvm1DVectorTy; - // Multiplicity of llvmNDVectorTy to llvm1DVectorTy. - SmallVector arraySizes; -}; -} // namespace - -// For >1-D vector types, extracts the necessary information to iterate over all -// 1-D subvectors in the underlying llrepresentation of the n-D vector -// Iterates on the llvm array type until we hit a non-array type (which is -// asserted to be an llvm vector type). -static NDVectorTypeInfo extractNDVectorTypeInfo(VectorType vectorType, - LLVMTypeConverter &converter) { - assert(vectorType.getRank() > 1 && "expected >1D vector type"); - NDVectorTypeInfo info; - info.llvmNDVectorTy = converter.convertType(vectorType); - if (!info.llvmNDVectorTy || !LLVM::isCompatibleType(info.llvmNDVectorTy)) { - info.llvmNDVectorTy = nullptr; - return info; - } - info.arraySizes.reserve(vectorType.getRank() - 1); - auto llvmTy = info.llvmNDVectorTy; - while (llvmTy.isa()) { - info.arraySizes.push_back( - llvmTy.cast().getNumElements()); - llvmTy = llvmTy.cast().getElementType(); - } - if (!LLVM::isCompatibleVectorType(llvmTy)) - return info; - info.llvm1DVectorTy = llvmTy; - return info; -} - -// Express `linearIndex` in terms of coordinates of `basis`. -// Returns the empty vector when linearIndex is out of the range [0, P] where -// P is the product of all the basis coordinates. -// -// Prerequisites: -// Basis is an array of nonnegative integers (signed type inherited from -// vector shape type). -static SmallVector getCoordinates(ArrayRef basis, - unsigned linearIndex) { - SmallVector res; - res.reserve(basis.size()); - for (unsigned basisElement : llvm::reverse(basis)) { - res.push_back(linearIndex % basisElement); - linearIndex = linearIndex / basisElement; - } - if (linearIndex > 0) - return {}; - std::reverse(res.begin(), res.end()); - return res; -} - -// Iterate of linear index, convert to coords space and insert splatted 1-D -// vector in each position. -template -void nDVectorIterate(const NDVectorTypeInfo &info, OpBuilder &builder, - Lambda fun) { - unsigned ub = 1; - for (auto s : info.arraySizes) - ub *= s; - for (unsigned linearIndex = 0; linearIndex < ub; ++linearIndex) { - auto coords = getCoordinates(info.arraySizes, linearIndex); - // Linear index is out of bounds, we are done. - if (coords.empty()) - break; - assert(coords.size() == info.arraySizes.size()); - auto position = builder.getI64ArrayAttr(coords); - fun(position); - } -} -////////////// End Support for Lowering operations on n-D vectors ////////////// - -/// Replaces the given operation "op" with a new operation of type "targetOp" -/// and given operands. -LogicalResult LLVM::detail::oneToOneRewrite( - Operation *op, StringRef targetOp, ValueRange operands, - LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) { - unsigned numResults = op->getNumResults(); - - Type packedType; - if (numResults != 0) { - packedType = typeConverter.packFunctionResults(op->getResultTypes()); - if (!packedType) - return failure(); - } - - // Create the operation through state since we don't know its C++ type. 
- OperationState state(op->getLoc(), targetOp); - state.addTypes(packedType); - state.addOperands(operands); - state.addAttributes(op->getAttrs()); - Operation *newOp = rewriter.createOperation(state); - - // If the operation produced 0 or 1 result, return them immediately. - if (numResults == 0) - return rewriter.eraseOp(op), success(); - if (numResults == 1) - return rewriter.replaceOp(op, newOp->getResult(0)), success(); - - // Otherwise, it had been converted to an operation producing a structure. - // Extract individual results from the structure and return them as list. - SmallVector results; - results.reserve(numResults); - for (unsigned i = 0; i < numResults; ++i) { - auto type = typeConverter.convertType(op->getResult(i).getType()); - results.push_back(rewriter.create( - op->getLoc(), type, newOp->getResult(0), rewriter.getI64ArrayAttr(i))); - } - rewriter.replaceOp(op, results); - return success(); -} - -static LogicalResult handleMultidimensionalVectors( - Operation *op, ValueRange operands, LLVMTypeConverter &typeConverter, - std::function createOperand, - ConversionPatternRewriter &rewriter) { - auto resultNDVectorType = op->getResult(0).getType().cast(); - - SmallVector operand1DVectorTypes; - for (Value operand : op->getOperands()) { - auto operandNDVectorType = operand.getType().cast(); - auto operandTypeInfo = - extractNDVectorTypeInfo(operandNDVectorType, typeConverter); - operand1DVectorTypes.push_back(operandTypeInfo.llvm1DVectorTy); - } - auto resultTypeInfo = - extractNDVectorTypeInfo(resultNDVectorType, typeConverter); - auto result1DVectorTy = resultTypeInfo.llvm1DVectorTy; - auto resultNDVectoryTy = resultTypeInfo.llvmNDVectorTy; - auto loc = op->getLoc(); - Value desc = rewriter.create(loc, resultNDVectoryTy); - nDVectorIterate(resultTypeInfo, rewriter, [&](ArrayAttr position) { - // For this unrolled `position` corresponding to the `linearIndex`^th - // element, extract operand vectors - SmallVector extractedOperands; - for (auto operand : llvm::enumerate(operands)) { - extractedOperands.push_back(rewriter.create( - loc, operand1DVectorTypes[operand.index()], operand.value(), - position)); - } - Value newVal = createOperand(result1DVectorTy, extractedOperands); - desc = rewriter.create(loc, resultNDVectoryTy, desc, - newVal, position); - }); - rewriter.replaceOp(op, desc); - return success(); -} - -LogicalResult LLVM::detail::vectorOneToOneRewrite( - Operation *op, StringRef targetOp, ValueRange operands, - LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter) { - assert(!operands.empty()); - - // Cannot convert ops if their operands are not of LLVM type. - if (!llvm::all_of(operands.getTypes(), - [](Type t) { return isCompatibleType(t); })) - return failure(); - - auto llvmNDVectorTy = operands[0].getType(); - if (!llvmNDVectorTy.isa()) - return oneToOneRewrite(op, targetOp, operands, typeConverter, rewriter); - - auto callback = [op, targetOp, &rewriter](Type llvm1DVectorTy, - ValueRange operands) { - OperationState state(op->getLoc(), targetOp); - state.addTypes(llvm1DVectorTy); - state.addOperands(operands); - state.addAttributes(op->getAttrs()); - return rewriter.createOperation(state)->getResult(0); - }; - - return handleMultidimensionalVectors(op, operands, typeConverter, callback, - rewriter); -} - -namespace { // Straightforward lowerings. 
using AbsFOpLowering = VectorConvertToLLVMPattern; using AddFOpLowering = VectorConvertToLLVMPattern; @@ -1427,7 +1037,7 @@ struct ExpM1OpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(op, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { auto splatAttr = SplatElementsAttr::get( @@ -1482,7 +1092,7 @@ struct Log1pOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(op, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { auto splatAttr = SplatElementsAttr::get( @@ -1536,7 +1146,7 @@ struct RsqrtOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return failure(); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { auto splatAttr = SplatElementsAttr::get( @@ -2244,7 +1854,7 @@ struct CmpIOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(cmpiOp, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( cmpiOp.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { CmpIOpAdaptor transformed(operands); @@ -2282,7 +1892,7 @@ struct CmpFOpLowering : public ConvertOpToLLVMPattern { if (!vectorType) return rewriter.notifyMatchFailure(cmpfOp, "expected vector result type"); - return handleMultidimensionalVectors( + return LLVM::detail::handleMultidimensionalVectors( cmpfOp.getOperation(), operands, *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { CmpFOpAdaptor transformed(operands); @@ -2445,7 +2055,7 @@ struct SplatNdOpLowering : public ConvertOpToLLVMPattern { // First insert it into an undef vector so we can shuffle it. auto loc = splatOp.getLoc(); auto vectorTypeInfo = - extractNDVectorTypeInfo(resultType, *getTypeConverter()); + LLVM::detail::extractNDVectorTypeInfo(resultType, *getTypeConverter()); auto llvmNDVectorTy = vectorTypeInfo.llvmNDVectorTy; auto llvm1DVectorTy = vectorTypeInfo.llvm1DVectorTy; if (!llvmNDVectorTy || !llvm1DVectorTy) diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index 21104281b8120..14187f400e726 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -20,6 +20,7 @@ add_mlir_dialect_library(MLIRLinalg MLIRSideEffectInterfaces MLIRViewLikeInterface MLIRStandard + MLIRMath MLIRMemRef MLIRTensor ) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 7774dbe5cd722..7d22cfd3ef0eb 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -338,10 +338,12 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { return op->emitOpError("expected at least one output operand"); if (failed(OpTrait::impl::verifyNOperands(op, numInputs + numOutputs))) return failure(); - // Should have at least one output tensor per result tensor. 
-  // Can also have outbut buffers that do not correspond to results.
-  if (op->getNumResults() > linalgOp.getOutputTensorOperands().size())
-    return op->emitOpError("unexpected #results > #outputs");
+  // Verify the number of results matches the number of output tensors.
+  if (op->getNumResults() != linalgOp.getOutputTensorOperands().size())
+    return op->emitOpError("expected the number of results (")
+           << op->getNumResults()
+           << ") to be equal to the number of output tensors ("
+           << linalgOp.getOutputTensorOperands().size() << ")";
 
   // Before checking indexing maps, we need to make sure the attributes
   // referenced by it are valid.
@@ -394,10 +396,6 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) {
                               "all have buffer type");
 
   for (OpOperand *opOperand : linalgOp.getOutputTensorOperands()) {
-    // TODO: Enforce one output tensor per result?
-    if (opOperand->getOperandNumber() - linalgOp.getNumInputs() >=
-        linalgOp->getNumResults())
-      continue;
     OpResult result = linalgOp.getTiedOpResult(opOperand);
     if (result.getType() != opOperand->get().getType())
       return op->emitOpError("expected type of operand #")
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 66cad6eaa3ccc..ea12a312d9c01 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -256,6 +256,20 @@ class RegionBuilderHelper {
     llvm_unreachable("unsupported non numeric type");
   }
 
+  Value applyfn__exp(Value x) {
+    OpBuilder builder = getBuilder();
+    if (isFloatingPoint(x))
+      return builder.create<math::ExpOp>(x.getLoc(), x);
+    llvm_unreachable("unsupported non numeric type");
+  }
+
+  Value applyfn__log(Value x) {
+    OpBuilder builder = getBuilder();
+    if (isFloatingPoint(x))
+      return builder.create<math::LogOp>(x.getLoc(), x);
+    llvm_unreachable("unsupported non numeric type");
+  }
+
   Value applyfn__sub(Value lhs, Value rhs) {
     OpBuilder builder = getBuilder();
     if (isFloatingPoint(lhs))
@@ -458,10 +472,6 @@ static LogicalResult verify(FillOp op) {
   Type fillType = op.value().getType();
   if (getElementTypeOrSelf(output->get()) != fillType)
     return op.emitOpError("expects fill type to match view elemental type");
-  if (!op.getNumResults() && !output->get().getType().isa<MemRefType>()) {
-    return op.emitOpError(
-        "expected fill op with no result value to use memref type");
-  }
   return success();
 }
 
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
index 61d2260587116..3810df9dff74a 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
@@ -7,6 +7,7 @@
 from mlir.ir import *
 from mlir.dialects import linalg
 from mlir.dialects import std
+from mlir.dialects import math
 
 # TODO: resolve name collision for Linalg functionality that is injected inside
 # the _mlir.dialects.linalg directly via pybind.
from _mlir.dialects.linalg import fill_builtin_region @@ -293,6 +294,16 @@ def _eval_add(self, lhs: Value, rhs: Value) -> Value: return std.AddIOp(lhs.type, lhs, rhs).result raise NotImplementedError("Unsupported 'add' operand: {lhs}") + def _eval_exp(self, x: Value) -> Value: + if _is_floating_point_type(x.type): + return math.ExpOp(x.type, x).result + raise NotImplementedError("Unsupported 'exp' operand: {x}") + + def _eval_log(self, x: Value) -> Value: + if _is_floating_point_type(x.type): + return math.LogOp(x.type, x).result + raise NotImplementedError("Unsupported 'log' operand: {x}") + def _eval_sub(self, lhs: Value, rhs: Value) -> Value: if _is_floating_point_type(lhs.type): return std.SubFOp(lhs.type, lhs, rhs).result diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index a37e1944c1f75..72793cbf9c726 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -209,3 +209,16 @@ def fill_rng_2d( offset = cast(F64, const(2147483647)) scaling = (max - min) * inv_range O[D.m, D.n] = cast(T, (offset + cast(F64, rand2)) * scaling + min) + + +@linalg_structured_op +def soft_plus_2d( + I=TensorDef(T, S.M, S.N), O=TensorDef(U, S.M, S.N, output=True)): + """Implements the soft plus operator. + + Numeric casting is performed on the input operand, promoting it to the same + data type as the accumulator/output. + """ + domain(D.m, D.n) + O[D.m, D.n] = \ + PrimFn.log(cast(U, const(1.0)) + PrimFn.exp(cast(U, I[D.m, D.n]))) diff --git a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir index 0e1c6a62a7b10..aed3585d4f547 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir @@ -188,6 +188,23 @@ func @generalize_fill_rng_2d_i32(%min: f64, %max: f64, %seed: i32, %O: tensor<16 // CHECK-NEXT: linalg.yield %[[VAL6]] : i32 // CHECK-NEXT: -> tensor<16x32xi32> +// ----- + +func @generalize_soft_plus_2d_f32(%input: tensor<16x32xf32>, %output: tensor<16x32xf32>) -> tensor<16x32xf32> { + %0 = linalg.soft_plus_2d ins(%input: tensor<16x32xf32>) outs(%output: tensor<16x32xf32>) -> tensor<16x32xf32> + return %0: tensor<16x32xf32> +} + +// CHECK-LABEL: @generalize_soft_plus_2d_f32 +// CHECK: %[[C1:.+]] = constant 1.000000e+00 : f64 +// CHECK: ^{{.*}}(%[[IN:.+]]: f32, %[[OUT:.+]]: f32 +// CHECK-NEXT: %[[C1_CAST:.+]] = fptrunc %[[C1]] : f64 to f32 +// CHECK-NEXT: %[[EXP:.+]] = math.exp %[[IN]] : f32 +// CHECK-NEXT: %[[SUM:.+]] = addf %[[C1_CAST]], %[[EXP]] : f32 +// CHECK-NEXT: %[[LOG:.+]] = math.log %[[SUM]] : f32 +// CHECK-NEXT: linalg.yield %[[LOG]] : f32 +// CHECK-NEXT: -> tensor<16x32xf32> + // ----- // Verifies floating point to integer cast. 
func @generalize_matmul_tensor_f32_f32_i16(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xi16>) -> tensor<16x32xi16> { diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index 8f26533f0b32f..6d8536a730d7a 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -640,7 +640,7 @@ func @pad_yield_type(%arg0: tensor, %arg1: i8) -> tensor { func @illegal_fill_tensor_no_return(%arg0 : index, %arg1 : index, %arg2 : f32) { %0 = linalg.init_tensor [%arg0, %arg1] : tensor - // expected-error @+1 {{expected fill op with no result value to use memref type}} + // expected-error @+1 {{expected the number of results (0) to be equal to the number of output tensors (1)}} linalg.fill(%arg2, %0) : f32, tensor } @@ -648,7 +648,7 @@ func @illegal_fill_tensor_no_return(%arg0 : index, %arg1 : index, %arg2 : f32) func @illegal_fill_memref_with_return(%arg0 : memref, %arg1 : f32) -> memref { - // expected-error @+1 {{unexpected #results > #outputs}} + // expected-error @+1 {{expected the number of results (1) to be equal to the number of output tensors (0)}} %0 = linalg.fill(%arg1, %arg0) : f32, memref -> memref return %0 : memref } @@ -658,7 +658,7 @@ func @illegal_fill_memref_with_return(%arg0 : memref, %arg1 : f32) -> m func @illegal_fill_memref_with_tensor_return (%arg0 : memref, %arg1 : f32) -> tensor { - // expected-error @+1 {{unexpected #results > #outputs}} + // expected-error @+1 {{expected the number of results (1) to be equal to the number of output tensors (0)}} %0 = linalg.fill(%arg1, %arg0) : f32, memref -> tensor return %0 : tensor } diff --git a/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py b/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py index 44ac4e8e8c5b4..ed33644859012 100644 --- a/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py +++ b/mlir/test/python/dialects/linalg/opdsl/emit_structured_generic.py @@ -84,6 +84,13 @@ def fill_rng_poly( O[D.m, D.n] = cast(T, (offset + cast(F64, rand2)) * scaling + min) +@linalg_structured_op +def soft_plus_poly( + I=TensorDef(T, S.M, S.N), O=TensorDef(U, S.M, S.N, output=True)): + O[D.m, D.n] = \ + PrimFn.log(cast(U, const(1.0)) + cast(U, PrimFn.exp(I[D.m, D.n]))) + + with Context() as ctx, Location.unknown(): module = Module.create() f16 = F16Type.get() @@ -299,5 +306,19 @@ def test_f32f32_min_pooling(input, shape, init_result): def test_i32_fill_rng(min, max, seed, init_result): return fill_rng_poly(min, max, seed, outs=[init_result]) + # CHECK-LABEL: @test_f32_soft_plus + # CHECK: ^{{.*}}(%[[IN:.+]]: f32, %[[OUT:.+]]: f32) + # CHECK-NEXT: %[[C1:.+]] = constant 1.000000e+00 : f64 + # CHECK-NEXT: %[[C1_CAST:.+]] = fptrunc %[[C1]] : f64 to f32 + # CHECK-NEXT: %[[EXP:.+]] = math.exp %[[IN]] : f32 + # CHECK-NEXT: %[[SUM:.+]] = addf %[[C1_CAST]], %[[EXP]] : f32 + # CHECK-NEXT: %[[LOG:.+]] = math.log %[[SUM]] : f32 + # CHECK-NEXT: linalg.yield %[[LOG]] : f32 + # CHECK-NEXT: -> tensor<4x16xf32> + @builtin.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((4, 16), f32)) + def test_f32_soft_plus(input, init_result): + return soft_plus_poly(input, outs=[init_result]) + print(module) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 9cab8fa6b117b..27f96f443031e 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2805,6 +2805,7 @@ cc_library( 
":GPUToNVVMGen", ":GPUTransforms", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":MathDialect", ":MemRefDialect", @@ -2888,6 +2889,7 @@ cc_library( ":GPUDialect", ":GPUToROCDLTGen", ":GPUTransforms", + ":LLVMCommonConversion", ":MathDialect", ":Pass", ":ROCDLDialect", @@ -3012,6 +3014,7 @@ cc_library( ":ConversionPassIncGen", ":GPUDialect", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":Pass", ":SPIRVDialect", @@ -4236,6 +4239,22 @@ alias( actual = "SCFToStandard", ) +cc_library( + name = "LLVMCommonConversion", + srcs = glob([ + "lib/Conversion/LLVMCommon/*.cpp", + ]) + ["lib/Conversion/LLVMCommon/MemRefDescriptor.h"], + hdrs = glob(["include/mlir/Conversion/LLVMCommon/*.h"]), + includes = ["include"], + deps = [ + ":IR", + ":LLVMDialect", + ":Support", + ":Transforms", + "//llvm:Core", + ], +) + cc_library( name = "StandardToLLVM", srcs = [ @@ -4253,6 +4272,7 @@ cc_library( ":DataLayoutInterfaces", ":DialectUtils", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":MathDialect", ":MemRefDialect", @@ -5127,6 +5147,7 @@ cc_binary( ":GPUToSPIRV", ":GPUToVulkanTransforms", ":GPUTransforms", + ":LLVMCommonConversion", ":LLVMDialect", ":LLVMToLLVMIRTranslation", ":MemRefDialect", @@ -6229,6 +6250,7 @@ cc_library( ], deps = [ ":ConversionPassIncGen", + ":DialectUtils", ":IR", ":LinalgOps", ":MathDialect", @@ -6378,6 +6400,7 @@ cc_library( ":ComplexDialect", ":ConversionPassIncGen", ":IR", + ":LLVMCommonConversion", ":LLVMDialect", ":Pass", ":StandardToLLVM",