diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 1744c1e571722..1aad25242712f 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -47,12 +47,16 @@ BreakFunctionNames("break-funcs", cl::cat(BoltCategory)); static cl::list -FunctionPadSpec("pad-funcs", - cl::CommaSeparated, - cl::desc("list of functions to pad with amount of bytes"), - cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."), - cl::Hidden, - cl::cat(BoltCategory)); + FunctionPadSpec("pad-funcs", cl::CommaSeparated, + cl::desc("list of functions to pad with amount of bytes"), + cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."), + cl::Hidden, cl::cat(BoltCategory)); + +static cl::list FunctionPadBeforeSpec( + "pad-funcs-before", cl::CommaSeparated, + cl::desc("list of functions to pad with amount of bytes"), + cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."), cl::Hidden, + cl::cat(BoltCategory)); static cl::opt MarkFuncs( "mark-funcs", @@ -70,11 +74,11 @@ X86AlignBranchBoundaryHotOnly("x86-align-branch-boundary-hot-only", cl::init(true), cl::cat(BoltOptCategory)); -size_t padFunction(const BinaryFunction &Function) { - static std::map FunctionPadding; - - if (FunctionPadding.empty() && !FunctionPadSpec.empty()) { - for (std::string &Spec : FunctionPadSpec) { +size_t padFunction(std::map &FunctionPadding, + const cl::list &Spec, + const BinaryFunction &Function) { + if (FunctionPadding.empty() && !Spec.empty()) { + for (const std::string &Spec : Spec) { size_t N = Spec.find(':'); if (N == std::string::npos) continue; @@ -94,6 +98,15 @@ size_t padFunction(const BinaryFunction &Function) { return 0; } +size_t padFunctionBefore(const BinaryFunction &Function) { + static std::map CacheFunctionPadding; + return padFunction(CacheFunctionPadding, FunctionPadBeforeSpec, Function); +} +size_t padFunctionAfter(const BinaryFunction &Function) { + static std::map CacheFunctionPadding; + return padFunction(CacheFunctionPadding, 
FunctionPadSpec, Function); +} + } // namespace opts namespace { @@ -319,6 +332,31 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, Streamer.emitCodeAlignment(Function.getAlign(), &*BC.STI); } + if (size_t Padding = opts::padFunctionBefore(Function)) { + // Handle padFuncsBefore after the above alignment logic but before + // symbol addresses are decided. + if (!BC.HasRelocations) { + BC.errs() << "BOLT-ERROR: -pad-before-funcs is not supported in " + << "non-relocation mode\n"; + exit(1); + } + + // Preserve Function.getMinAlign(). + if (!isAligned(Function.getMinAlign(), Padding)) { + BC.errs() << "BOLT-ERROR: user-requested " << Padding + << " padding bytes before function " << Function + << " is not a multiple of the minimum function alignment (" + << Function.getMinAlign().value() << ").\n"; + exit(1); + } + + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding before function " << Function + << " with " << Padding << " bytes\n"); + + // Since the padding is not executed, it can be null bytes. + Streamer.emitFill(Padding, 0); + } + MCContext &Context = Streamer.getContext(); const MCAsmInfo *MAI = Context.getAsmInfo(); @@ -373,7 +411,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, emitFunctionBody(Function, FF, /*EmitCodeOnly=*/false); // Emit padding if requested. 
- if (size_t Padding = opts::padFunction(Function)) { + if (size_t Padding = opts::padFunctionAfter(Function)) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function << " with " << Padding << " bytes\n"); Streamer.emitFill(Padding, MAI->getTextAlignFillValue()); diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 1256d71342b13..35c5acfdecdb9 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -28,7 +28,8 @@ extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; extern cl::opt RandomSeed; -extern size_t padFunction(const bolt::BinaryFunction &Function); +extern size_t padFunctionBefore(const bolt::BinaryFunction &Function); +extern size_t padFunctionAfter(const bolt::BinaryFunction &Function); extern cl::opt ReorderFunctions; cl::opt ReorderFunctions( @@ -304,8 +305,10 @@ Error ReorderFunctions::runOnFunctions(BinaryContext &BC) { return false; if (B->isIgnored()) return true; - const size_t PadA = opts::padFunction(*A); - const size_t PadB = opts::padFunction(*B); + const size_t PadA = opts::padFunctionBefore(*A) + + opts::padFunctionAfter(*A); + const size_t PadB = opts::padFunctionBefore(*B) + + opts::padFunctionAfter(*B); if (!PadA || !PadB) { if (PadA) return true; diff --git a/bolt/test/AArch64/pad-before-funcs.s b/bolt/test/AArch64/pad-before-funcs.s new file mode 100644 index 0000000000000..f3e8a23ddfdda --- /dev/null +++ b/bolt/test/AArch64/pad-before-funcs.s @@ -0,0 +1,48 @@ +# Test checks that --pad-before-funcs is working as expected. +# It should be able to introduce a configurable offset for the _start symbol. +# It should reject requests which don't obey the code alignment requirement. + +# Tests check inserting padding before _start; and additionally a test where +# padding is inserted after start. In each case, check that the following +# symbol ends up in the expected place as well. 
+ + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -Wl,--section-start=.text=0x4000 +# RUN: llvm-bolt %t.exe -o %t.bolt.0 --pad-funcs-before=_start:0 +# RUN: llvm-bolt %t.exe -o %t.bolt.4 --pad-funcs-before=_start:4 +# RUN: llvm-bolt %t.exe -o %t.bolt.8 --pad-funcs-before=_start:8 +# RUN: llvm-bolt %t.exe -o %t.bolt.4.4 --pad-funcs-before=_start:4 --pad-funcs=_start:4 +# RUN: llvm-bolt %t.exe -o %t.bolt.4.8 --pad-funcs-before=_start:4 --pad-funcs=_start:8 + +# RUN: not llvm-bolt %t.exe -o %t.bolt.8 --pad-funcs-before=_start:1 2>&1 | FileCheck --check-prefix=CHECK-BAD-ALIGN %s + +# CHECK-BAD-ALIGN: user-requested 1 padding bytes before function _start(*2) is not a multiple of the minimum function alignment (4). + +# RUN: llvm-objdump --section=.text --disassemble %t.bolt.0 | FileCheck --check-prefix=CHECK-0 %s +# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4 | FileCheck --check-prefix=CHECK-4 %s +# RUN: llvm-objdump --section=.text --disassemble %t.bolt.8 | FileCheck --check-prefix=CHECK-8 %s +# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4.4 | FileCheck --check-prefix=CHECK-4-4 %s +# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4.8 | FileCheck --check-prefix=CHECK-4-8 %s + +# Trigger relocation mode in bolt. 
+.reloc 0, R_AARCH64_NONE + +.section .text + +# CHECK-0: 0000000000400000 <_start> +# CHECK-4: 0000000000400004 <_start> +# CHECK-4-4: 0000000000400004 <_start> +# CHECK-8: 0000000000400008 <_start> +.globl _start +_start: + ret + +# CHECK-0: 0000000000400004 <_subsequent> +# CHECK-4: 0000000000400008 <_subsequent> +# CHECK-4-4: 000000000040000c <_subsequent> +# CHECK-4-8: 0000000000400010 <_subsequent> +# CHECK-8: 000000000040000c <_subsequent> +.globl _subsequent +_subsequent: + ret diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp index 8121a36f80346..1f432c4ccc5f0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp @@ -74,9 +74,11 @@ void UnhandledSelfAssignmentCheck::registerMatchers(MatchFinder *Finder) { // Matcher for standard smart pointers. const auto SmartPointerType = qualType(hasUnqualifiedDesugaredType( recordType(hasDeclaration(classTemplateSpecializationDecl( - hasAnyName("::std::shared_ptr", "::std::unique_ptr", - "::std::weak_ptr", "::std::auto_ptr"), - templateArgumentCountIs(1)))))); + anyOf(allOf(hasAnyName("::std::shared_ptr", "::std::weak_ptr", + "::std::auto_ptr"), + templateArgumentCountIs(1)), + allOf(hasName("::std::unique_ptr"), + templateArgumentCountIs(2)))))))); // We will warn only if the class has a pointer or a C array field which // probably causes a problem during self-assignment (e.g. first resetting diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 1fd9b6077be5f..35cb3e387e4e6 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -233,6 +233,10 @@ Changes in existing checks `bsl::optional` and `bdlb::NullableValue` from _. 
+- Improved :doc:`bugprone-unhandled-self-assignment + ` check by fixing smart + pointer check against std::unique_ptr type. + - Improved :doc:`bugprone-unsafe-functions ` check to allow specifying additional functions to match. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp index 14d27855d7c5a..8610393449f97 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp @@ -10,7 +10,9 @@ template T &&move(T &x) { } -template +template class default_delete {}; + +template > class unique_ptr { }; diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 922dd21bcda7e..e8d3f1b63f1bb 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -734,6 +734,16 @@ Improvements to Clang's diagnostics return ptr + index < ptr; // warning } +- Clang now emits a ``-Wvarargs`` diagnostic when the second argument + to ``va_arg`` is of array type, which is an undefined behavior (#GH119360). + + .. code-block:: c++ + + void test() { + va_list va; + va_arg(va, int[10]); // warning + } + - Fix -Wdangling false positives on conditional operators (#120206). - Fixed a bug where Clang hung on an unsupported optional scope specifier ``::`` when parsing @@ -786,6 +796,7 @@ Bug Fixes in This Version the unsupported type instead of the ``register`` keyword (#GH109776). - Fixed a crash when emit ctor for global variant with flexible array init (#GH113187). - Fixed a crash when GNU statement expression contains invalid statement (#GH113468). +- Fixed a crash when passing the variable length array type to ``va_arg`` (#GH119360). - Fixed a failed assertion when using ``__attribute__((noderef))`` on an ``_Atomic``-qualified type (#GH116124). 
- No longer return ``false`` for ``noexcept`` expressions involving a diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 29d5e1f92a69c..e093b2d672a74 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -476,6 +476,9 @@ cplusplus.NewDelete (C++) """"""""""""""""""""""""" Check for double-free and use-after-free problems. Traces memory managed by new/delete. +Custom allocation/deallocation functions can be defined using +:ref:`ownership attributes`. + .. literalinclude:: checkers/newdelete_example.cpp :language: cpp @@ -485,6 +488,9 @@ cplusplus.NewDeleteLeaks (C++) """""""""""""""""""""""""""""" Check for memory leaks. Traces memory managed by new/delete. +Custom allocation/deallocation functions can be defined using +:ref:`ownership attributes`. + .. code-block:: cpp void test() { @@ -1263,6 +1269,9 @@ You can silence this warning either by bound checking the ``size`` parameter, or by explicitly marking the ``size`` parameter as sanitized. See the :ref:`optin-taint-GenericTaint` checker for an example. +Custom allocation/deallocation functions can be defined using +:ref:`ownership attributes`. + .. code-block:: c void vulnerable(void) { @@ -1857,6 +1866,9 @@ unix.Malloc (C) """"""""""""""" Check for memory leaks, double free, and use-after-free problems. Traces memory managed by malloc()/free(). +Custom allocation/deallocation functions can be defined using +:ref:`ownership attributes`. + .. literalinclude:: checkers/unix_malloc_example.c :language: c @@ -1866,6 +1878,9 @@ unix.MallocSizeof (C) """"""""""""""""""""" Check for dubious ``malloc`` arguments involving ``sizeof``. +Custom allocation/deallocation functions can be defined using +:ref:`ownership attributes`. + .. code-block:: c void test() { @@ -1881,6 +1896,9 @@ unix.MismatchedDeallocator (C, C++) """"""""""""""""""""""""""""""""""" Check for mismatched deallocators. 
+Custom allocation/deallocation functions can be defined using +:ref:`ownership attributes`. + .. literalinclude:: checkers/mismatched_deallocator_example.cpp :language: c diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 3f95f1db2fbe5..63d266dc60ec7 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2202,7 +2202,11 @@ enum CXCursorKind { */ CXCursor_OpenACCSetConstruct = 330, - CXCursor_LastStmt = CXCursor_OpenACCSetConstruct, + /** OpenACC update Construct. + */ + CXCursor_OpenACCUpdateConstruct = 331, + + CXCursor_LastStmt = CXCursor_OpenACCUpdateConstruct, /** * Cursor that represents the translation unit itself. diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 92954cf566c83..d500f4eadef75 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -4082,6 +4082,8 @@ DEF_TRAVERSE_STMT(OpenACCShutdownConstruct, { TRY_TO(VisitOpenACCClauseList(S->clauses())); }) DEF_TRAVERSE_STMT(OpenACCSetConstruct, { TRY_TO(VisitOpenACCClauseList(S->clauses())); }) +DEF_TRAVERSE_STMT(OpenACCUpdateConstruct, + { TRY_TO(VisitOpenACCClauseList(S->clauses())); }) // Traverse HLSL: Out argument expression DEF_TRAVERSE_STMT(HLSLOutArgExpr, {}) diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index d3cc106ff0081..ebbee152f918f 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -712,5 +712,44 @@ class OpenACCSetConstruct final SourceLocation End, ArrayRef Clauses); }; +// This class represents an 'update' construct, which has just a clause list. 
+class OpenACCUpdateConstruct final + : public OpenACCConstructStmt, + private llvm::TrailingObjects { + friend TrailingObjects; + OpenACCUpdateConstruct(unsigned NumClauses) + : OpenACCConstructStmt(OpenACCUpdateConstructClass, + OpenACCDirectiveKind::Update, SourceLocation{}, + SourceLocation{}, SourceLocation{}) { + std::uninitialized_value_construct( + getTrailingObjects(), + getTrailingObjects() + NumClauses); + setClauseList(MutableArrayRef(getTrailingObjects(), + NumClauses)); + } + + OpenACCUpdateConstruct(SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, + ArrayRef Clauses) + : OpenACCConstructStmt(OpenACCUpdateConstructClass, + OpenACCDirectiveKind::Update, Start, DirectiveLoc, + End) { + std::uninitialized_copy(Clauses.begin(), Clauses.end(), + getTrailingObjects()); + setClauseList(MutableArrayRef(getTrailingObjects(), + Clauses.size())); + } + +public: + static bool classof(const Stmt *T) { + return T->getStmtClass() == OpenACCUpdateConstructClass; + } + static OpenACCUpdateConstruct *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + static OpenACCUpdateConstruct * + Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc, + SourceLocation End, ArrayRef Clauses); +}; } // namespace clang #endif // LLVM_CLANG_AST_STMTOPENACC_H diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index 59cd3ce5c8fbb..4aaae48ba8b42 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -419,6 +419,7 @@ class TextNodeDumper void VisitOpenACCInitConstruct(const OpenACCInitConstruct *S); void VisitOpenACCSetConstruct(const OpenACCSetConstruct *S); void VisitOpenACCShutdownConstruct(const OpenACCShutdownConstruct *S); + void VisitOpenACCUpdateConstruct(const OpenACCUpdateConstruct *S); void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S); void VisitEmbedExpr(const EmbedExpr *S); void VisitAtomicExpr(const AtomicExpr 
*AE); diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index ba581e02542fc..b8d702e41aa0b 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -1393,6 +1393,7 @@ def OwnershipDocs : Documentation { let Heading = "ownership_holds, ownership_returns, ownership_takes (Clang " "Static Analyzer)"; let Category = DocCatFunction; + let Label = "analyzer-ownership-attrs"; let Content = [{ .. note:: diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 671a5e3bf02a9..b138e57dce6e7 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10515,6 +10515,10 @@ def warn_second_parameter_to_va_arg_ownership_qualified : Warning< def warn_second_parameter_to_va_arg_never_compatible : Warning< "second argument to 'va_arg' is of promotable type %0; this va_arg has " "undefined behavior because arguments will be promoted to %1">, InGroup; +def warn_second_parameter_to_va_arg_array : Warning< + "second argument to 'va_arg' is of array type %0; " + "this va_arg has undefined behavior because arguments " + "will never be compatible with array type">, InGroup; def warn_return_missing_expr : Warning< "non-void %select{function|method}1 %0 should return a value">, DefaultError, @@ -12828,6 +12832,10 @@ def err_acc_loop_not_monotonic "('++', '--', or compound assignment)">; def err_acc_construct_one_clause_of : Error<"OpenACC '%0' construct must have at least one %1 clause">; +def err_acc_update_as_body + : Error<"OpenACC 'update' construct may not appear in place of the " + "statement following a%select{n if statement| while statement| do " + "statement| switch statement| label statement}0">; // AMDGCN builtins diagnostics def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; diff --git a/clang/include/clang/Basic/StmtNodes.td 
b/clang/include/clang/Basic/StmtNodes.td index 2ecf19ef6252d..ce2c48bd3c84e 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -316,6 +316,7 @@ def OpenACCWaitConstruct : StmtNode; def OpenACCInitConstruct : StmtNode; def OpenACCShutdownConstruct : StmtNode; def OpenACCSetConstruct : StmtNode; +def OpenACCUpdateConstruct : StmtNode; // OpenACC Additional Expressions. def OpenACCAsteriskSizeExpr : StmtNode; diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index f11565c55d84d..0260d23659883 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -757,6 +757,11 @@ class Driver { /// \returns true if error occurred. bool loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx); + /// Tries to load options from customization file. + /// + /// \returns true if error occurred. + bool loadZOSCustomizationFile(llvm::cl::ExpansionContext &); + /// Read options from the specified file. /// /// \param [in] FileName File to read. 
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index a46a7e133f1b2..aac165130b719 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -2025,6 +2025,7 @@ enum StmtCode { STMT_OPENACC_INIT_CONSTRUCT, STMT_OPENACC_SHUTDOWN_CONSTRUCT, STMT_OPENACC_SET_CONSTRUCT, + STMT_OPENACC_UPDATE_CONSTRUCT, // HLSL Constructs EXPR_HLSL_OUT_ARG, diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h index 3f341ecf8c1e4..2c970301879d2 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h @@ -126,11 +126,18 @@ enum class CTUPhase1InliningKind { None, Small, All }; class PositiveAnalyzerOption { public: - PositiveAnalyzerOption() = default; - PositiveAnalyzerOption(const PositiveAnalyzerOption &) = default; - PositiveAnalyzerOption &operator=(const PositiveAnalyzerOption &) = default; + constexpr PositiveAnalyzerOption() = default; + constexpr PositiveAnalyzerOption(unsigned Value) : Value(Value) { + assert(Value > 0 && "only positive values are accepted"); + } + constexpr PositiveAnalyzerOption(const PositiveAnalyzerOption &) = default; + constexpr PositiveAnalyzerOption & + operator=(const PositiveAnalyzerOption &Other) { + Value = Other.Value; + return *this; + } - static std::optional create(unsigned Val) { + static constexpr std::optional create(unsigned Val) { if (Val == 0) return std::nullopt; return PositiveAnalyzerOption{Val}; @@ -141,11 +148,9 @@ class PositiveAnalyzerOption { return std::nullopt; return PositiveAnalyzerOption::create(Parsed); } - operator unsigned() const { return Value; } + constexpr operator unsigned() const { return Value; } private: - explicit constexpr PositiveAnalyzerOption(unsigned Value) : Value(Value) {} - unsigned Value = 1; }; diff --git 
a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index 889573f57b40a..2b0ac716bab56 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -284,3 +284,24 @@ OpenACCSetConstruct::Create(const ASTContext &C, SourceLocation Start, auto *Inst = new (Mem) OpenACCSetConstruct(Start, DirectiveLoc, End, Clauses); return Inst; } + +OpenACCUpdateConstruct * +OpenACCUpdateConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) { + void *Mem = C.Allocate( + OpenACCUpdateConstruct::totalSizeToAlloc( + NumClauses)); + auto *Inst = new (Mem) OpenACCUpdateConstruct(NumClauses); + return Inst; +} + +OpenACCUpdateConstruct * +OpenACCUpdateConstruct::Create(const ASTContext &C, SourceLocation Start, + SourceLocation DirectiveLoc, SourceLocation End, + ArrayRef Clauses) { + void *Mem = C.Allocate( + OpenACCUpdateConstruct::totalSizeToAlloc( + Clauses.size())); + auto *Inst = + new (Mem) OpenACCUpdateConstruct(Start, DirectiveLoc, End, Clauses); + return Inst; +} diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 52eead979b175..52bcb5135d351 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -1204,10 +1204,12 @@ void StmtPrinter::VisitOpenACCInitConstruct(OpenACCInitConstruct *S) { void StmtPrinter::VisitOpenACCShutdownConstruct(OpenACCShutdownConstruct *S) { PrintOpenACCConstruct(S); } - void StmtPrinter::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) { PrintOpenACCConstruct(S); } +void StmtPrinter::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) { + PrintOpenACCConstruct(S); +} void StmtPrinter::VisitOpenACCWaitConstruct(OpenACCWaitConstruct *S) { Indent() << "#pragma acc wait"; diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 150b92ef6a1ab..b68c83f99550b 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2780,6 +2780,13 @@ void StmtProfiler::VisitOpenACCSetConstruct(const OpenACCSetConstruct *S) { 
P.VisitOpenACCClauseList(S->clauses()); } +void StmtProfiler::VisitOpenACCUpdateConstruct( + const OpenACCUpdateConstruct *S) { + VisitStmt(S); + OpenACCClauseProfiler P{*this}; + P.VisitOpenACCClauseList(S->clauses()); +} + void StmtProfiler::VisitHLSLOutArgExpr(const HLSLOutArgExpr *S) { VisitStmt(S); } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 00e3af3e81125..eedd8faad9e85 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -2931,7 +2931,6 @@ void TextNodeDumper::VisitOpenACCConstructStmt(const OpenACCConstructStmt *S) { OS << " " << S->getDirectiveKind(); } void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) { - if (S->isOrphanedLoopConstruct()) OS << " "; else @@ -2940,40 +2939,44 @@ void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) { void TextNodeDumper::VisitOpenACCCombinedConstruct( const OpenACCCombinedConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitOpenACCDataConstruct(const OpenACCDataConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitOpenACCEnterDataConstruct( const OpenACCEnterDataConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitOpenACCExitDataConstruct( const OpenACCExitDataConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitOpenACCHostDataConstruct( const OpenACCHostDataConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitOpenACCWaitConstruct(const OpenACCWaitConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitOpenACCInitConstruct(const OpenACCInitConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void 
TextNodeDumper::VisitOpenACCShutdownConstruct( const OpenACCShutdownConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitOpenACCSetConstruct(const OpenACCSetConstruct *S) { - OS << " " << S->getDirectiveKind(); + VisitOpenACCConstructStmt(S); +} +void TextNodeDumper::VisitOpenACCUpdateConstruct( + const OpenACCUpdateConstruct *S) { + VisitOpenACCConstructStmt(S); } void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) { diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp index 6f98353fb8c2e..88c054150ab22 100644 --- a/clang/lib/Basic/Targets/OSTargets.cpp +++ b/clang/lib/Basic/Targets/OSTargets.cpp @@ -114,6 +114,9 @@ void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts, assert(OsVersion.getMinor().value_or(0) < 100 && OsVersion.getSubminor().value_or(0) < 100 && "Invalid version!"); Builder.defineMacro("__ENVIRONMENT_OS_VERSION_MIN_REQUIRED__", Str); + + // Tell users about the kernel if there is one. + Builder.defineMacro("__MACH__"); } PlatformMinVersion = OsVersion; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index a9195536a4ded..8b1b882a6655c 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -6105,6 +6105,8 @@ RValue CodeGenFunction::EmitVAArg(VAArgExpr *VE, Address &VAListAddr, VAListAddr = VE->isMicrosoftABI() ? 
EmitMSVAListRef(VE->getSubExpr()) : EmitVAListRef(VE->getSubExpr()); QualType Ty = VE->getType(); + if (Ty->isVariablyModifiedType()) + EmitVariablyModifiedType(Ty); if (VE->isMicrosoftABI()) return CGM.getABIInfo().EmitMSVAArg(*this, VAListAddr, Ty, Slot); return CGM.getABIInfo().EmitVAArg(*this, VAListAddr, Ty, Slot); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 4b71bd730ce12..b282d4e0b32f0 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -5448,11 +5448,6 @@ Value *ScalarExprEmitter::VisitChooseExpr(ChooseExpr *E) { } Value *ScalarExprEmitter::VisitVAArgExpr(VAArgExpr *VE) { - QualType Ty = VE->getType(); - - if (Ty->isVariablyModifiedType()) - CGF.EmitVariablyModifiedType(Ty); - Address ArgValue = Address::invalid(); RValue ArgPtr = CGF.EmitVAArg(VE, ArgValue); diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 22b4f97eb302a..90b04bb6cc9b0 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -1469,6 +1469,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OpenACCSetConstructClass: EmitOpenACCSetConstruct(cast(*S)); break; + case Stmt::OpenACCUpdateConstructClass: + EmitOpenACCUpdateConstruct(cast(*S)); + break; } } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 4d374ed0d4b23..1a74105adf5e7 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4244,6 +4244,11 @@ class CodeGenFunction : public CodeGenTypeCache { // but in the future we will implement some sort of IR. } + void EmitOpenACCUpdateConstruct(const OpenACCUpdateConstruct &S) { + // TODO OpenACC: Implement this. It is currently implemented as a 'no-op', + // but in the future we will implement some sort of IR. 
+ } + //===--------------------------------------------------------------------===// // LValue Expression Emission //===--------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/SanitizerMetadata.cpp b/clang/lib/CodeGen/SanitizerMetadata.cpp index 61fdf3399ff3c..b7b212ba46efd 100644 --- a/clang/lib/CodeGen/SanitizerMetadata.cpp +++ b/clang/lib/CodeGen/SanitizerMetadata.cpp @@ -145,7 +145,9 @@ void SanitizerMetadata::reportGlobal(llvm::GlobalVariable *GV, const VarDecl &D, for (auto *Attr : D.specific_attrs()) NoSanitizeMask |= Attr->getMask(); - if (D.hasExternalStorage()) + // External definitions and incomplete types get handled at the place they + // are defined. + if (D.hasExternalStorage() || D.getType()->isIncompleteType()) NoSanitizeMask |= SanitizerKind::Type; return NoSanitizeMask; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index cf5620380ac2d..2838d9a9a543f 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1066,6 +1066,34 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, // } +bool Driver::loadZOSCustomizationFile(llvm::cl::ExpansionContext &ExpCtx) { + if (IsCLMode() || IsDXCMode() || IsFlangMode()) + return false; + + SmallString<128> CustomizationFile; + StringRef PathLIBEnv = StringRef(getenv("CLANG_CONFIG_PATH")).trim(); + // If the env var is a directory then append "/clang.cfg" and treat + // that as the config file. Otherwise treat the env var as the + // config file. 
+ if (!PathLIBEnv.empty()) { + llvm::sys::path::append(CustomizationFile, PathLIBEnv); + if (llvm::sys::fs::is_directory(PathLIBEnv)) + llvm::sys::path::append(CustomizationFile, "/clang.cfg"); + if (llvm::sys::fs::is_regular_file(CustomizationFile)) + return readConfigFile(CustomizationFile, ExpCtx); + Diag(diag::err_drv_config_file_not_found) << CustomizationFile; + return true; + } + + SmallString<128> BaseDir(llvm::sys::path::parent_path(Dir)); + llvm::sys::path::append(CustomizationFile, BaseDir + "/etc/clang.cfg"); + if (llvm::sys::fs::is_regular_file(CustomizationFile)) + return readConfigFile(CustomizationFile, ExpCtx); + + // If no customization file, just return + return false; +} + static void appendOneArg(InputArgList &Args, const Arg *Opt) { // The args for config files or /clang: flags belong to different InputArgList // objects than Args. This copies an Arg from one of those other InputArgLists @@ -1286,11 +1314,18 @@ bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) { } // Otherwise, use the real triple as used by the driver. + llvm::Triple RealTriple = + computeTargetTriple(*this, TargetTriple, *CLOptions); if (Triple.str().empty()) { - Triple = computeTargetTriple(*this, TargetTriple, *CLOptions); + Triple = RealTriple; assert(!Triple.str().empty()); } + // On z/OS, start by loading the customization file before loading + // the usual default config file(s). + if (RealTriple.isOSzOS() && loadZOSCustomizationFile(ExpCtx)) + return true; + // Search for config files in the following order: // 1. -.cfg using real driver mode // (e.g. i386-pc-linux-gnu-clang++.cfg). 
@@ -6747,8 +6782,6 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, TC = std::make_unique(*this, Target, Args); else if (Target.isOSBinFormatELF()) TC = std::make_unique(*this, Target, Args); - else if (Target.isAppleMachO()) - TC = std::make_unique(*this, Target, Args); else if (Target.isOSBinFormatMachO()) TC = std::make_unique(*this, Target, Args); else diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index e5dffb11d1a5e..56b6dd78673cb 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -966,14 +966,11 @@ MachO::MachO(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) getProgramPaths().push_back(getDriver().Dir); } -AppleMachO::AppleMachO(const Driver &D, const llvm::Triple &Triple, - const ArgList &Args) - : MachO(D, Triple, Args), CudaInstallation(D, Triple, Args), - RocmInstallation(D, Triple, Args), SYCLInstallation(D, Triple, Args) {} - /// Darwin - Darwin tool chain for i386 and x86_64. 
Darwin::Darwin(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) - : AppleMachO(D, Triple, Args), TargetInitialized(false) {} + : MachO(D, Triple, Args), TargetInitialized(false), + CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args), + SYCLInstallation(D, Triple, Args) {} types::ID MachO::LookupTypeForExtension(StringRef Ext) const { types::ID Ty = ToolChain::LookupTypeForExtension(Ext); @@ -1022,18 +1019,18 @@ bool Darwin::hasBlocksRuntime() const { } } -void AppleMachO::AddCudaIncludeArgs(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { +void Darwin::AddCudaIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { CudaInstallation->AddCudaIncludeArgs(DriverArgs, CC1Args); } -void AppleMachO::AddHIPIncludeArgs(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { +void Darwin::AddHIPIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args); } -void AppleMachO::addSYCLIncludeArgs(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { +void Darwin::addSYCLIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { SYCLInstallation->addSYCLIncludeArgs(DriverArgs, CC1Args); } @@ -1128,8 +1125,6 @@ VersionTuple MachO::getLinkerVersion(const llvm::opt::ArgList &Args) const { Darwin::~Darwin() {} -AppleMachO::~AppleMachO() {} - MachO::~MachO() {} std::string Darwin::ComputeEffectiveClangTriple(const ArgList &Args, @@ -2493,7 +2488,7 @@ static void AppendPlatformPrefix(SmallString<128> &Path, // Returns the effective sysroot from either -isysroot or --sysroot, plus the // platform prefix (if any). 
llvm::SmallString<128> -AppleMachO::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const { +DarwinClang::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const { llvm::SmallString<128> Path("/"); if (DriverArgs.hasArg(options::OPT_isysroot)) Path = DriverArgs.getLastArgValue(options::OPT_isysroot); @@ -2506,9 +2501,8 @@ AppleMachO::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const { return Path; } -void AppleMachO::AddClangSystemIncludeArgs( - const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const { +void DarwinClang::AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); llvm::SmallString<128> Sysroot = GetEffectiveSysroot(DriverArgs); @@ -2586,7 +2580,7 @@ bool DarwinClang::AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverAr return getVFS().exists(Base); } -void AppleMachO::AddClangCXXStdlibIncludeArgs( +void DarwinClang::AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const { // The implementation from a base class will pass through the -stdlib to @@ -2643,60 +2637,55 @@ void AppleMachO::AddClangCXXStdlibIncludeArgs( } case ToolChain::CST_Libstdcxx: - AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args); - break; - } -} - -void AppleMachO::AddGnuCPlusPlusIncludePaths( - const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const {} - -void DarwinClang::AddGnuCPlusPlusIncludePaths( - const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const { - llvm::SmallString<128> UsrIncludeCxx = GetEffectiveSysroot(DriverArgs); - llvm::sys::path::append(UsrIncludeCxx, "usr", "include", "c++"); + llvm::SmallString<128> UsrIncludeCxx = Sysroot; + llvm::sys::path::append(UsrIncludeCxx, "usr", "include", "c++"); + + llvm::Triple::ArchType arch = getTriple().getArch(); + bool IsBaseFound = true; + switch (arch) { + default: break; + + 
case llvm::Triple::x86: + case llvm::Triple::x86_64: + IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, + "4.2.1", + "i686-apple-darwin10", + arch == llvm::Triple::x86_64 ? "x86_64" : ""); + IsBaseFound |= AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, + "4.0.0", "i686-apple-darwin8", + ""); + break; - llvm::Triple::ArchType arch = getTriple().getArch(); - bool IsBaseFound = true; - switch (arch) { - default: - break; + case llvm::Triple::arm: + case llvm::Triple::thumb: + IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, + "4.2.1", + "arm-apple-darwin10", + "v7"); + IsBaseFound |= AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, + "4.2.1", + "arm-apple-darwin10", + "v6"); + break; - case llvm::Triple::x86: - case llvm::Triple::x86_64: - IsBaseFound = AddGnuCPlusPlusIncludePaths( - DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1", "i686-apple-darwin10", - arch == llvm::Triple::x86_64 ? "x86_64" : ""); - IsBaseFound |= AddGnuCPlusPlusIncludePaths( - DriverArgs, CC1Args, UsrIncludeCxx, "4.0.0", "i686-apple-darwin8", ""); - break; + case llvm::Triple::aarch64: + IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, + "4.2.1", + "arm64-apple-darwin10", + ""); + break; + } - case llvm::Triple::arm: - case llvm::Triple::thumb: - IsBaseFound = - AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1", - "arm-apple-darwin10", "v7"); - IsBaseFound |= - AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1", - "arm-apple-darwin10", "v6"); - break; + if (!IsBaseFound) { + getDriver().Diag(diag::warn_drv_libstdcxx_not_found); + } - case llvm::Triple::aarch64: - IsBaseFound = - AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1", - "arm64-apple-darwin10", ""); break; } - - if (!IsBaseFound) { - getDriver().Diag(diag::warn_drv_libstdcxx_not_found); - } } -void AppleMachO::AddCXXStdlibLibArgs(const ArgList &Args, - 
ArgStringList &CmdArgs) const { +void DarwinClang::AddCXXStdlibLibArgs(const ArgList &Args, + ArgStringList &CmdArgs) const { CXXStdlibType Type = GetCXXStdlibType(Args); switch (Type) { @@ -3632,7 +3621,7 @@ SanitizerMask Darwin::getSupportedSanitizers() const { return Res; } -void AppleMachO::printVerboseInfo(raw_ostream &OS) const { +void Darwin::printVerboseInfo(raw_ostream &OS) const { CudaInstallation->print(OS); RocmInstallation->print(OS); } diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h index c44780c577f4f..5bc18581cfd2e 100644 --- a/clang/lib/Driver/ToolChains/Darwin.h +++ b/clang/lib/Driver/ToolChains/Darwin.h @@ -291,52 +291,8 @@ class LLVM_LIBRARY_VISIBILITY MachO : public ToolChain { /// } }; -/// Apple specific MachO extensions -class LLVM_LIBRARY_VISIBILITY AppleMachO : public MachO { -public: - AppleMachO(const Driver &D, const llvm::Triple &Triple, - const llvm::opt::ArgList &Args); - ~AppleMachO() override; - - /// } - /// @name Apple Specific ToolChain Implementation - /// { - void - AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - - void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - - void AddClangCXXStdlibIncludeArgs( - const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) const override; - - void printVerboseInfo(raw_ostream &OS) const override; - /// } - - LazyDetector CudaInstallation; - LazyDetector RocmInstallation; - LazyDetector SYCLInstallation; - -protected: - llvm::SmallString<128> - 
GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const; - -private: - virtual void - AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const; -}; - /// Darwin - The base Darwin tool chain. -class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO { +class LLVM_LIBRARY_VISIBILITY Darwin : public MachO { public: /// Whether the information on the target has been initialized. // @@ -374,6 +330,10 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO { /// The target variant triple that was specified (if any). mutable std::optional TargetVariantTriple; + LazyDetector CudaInstallation; + LazyDetector RocmInstallation; + LazyDetector SYCLInstallation; + private: void AddDeploymentTarget(llvm::opt::DerivedArgList &Args) const; @@ -385,7 +345,7 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO { std::string ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args, types::ID InputType) const override; - /// @name Darwin Specific Toolchain Implementation + /// @name Apple Specific Toolchain Implementation /// { void addMinVersionArgs(const llvm::opt::ArgList &Args, @@ -601,6 +561,13 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO { ObjCRuntime getDefaultObjCRuntime(bool isNonFragile) const override; bool hasBlocksRuntime() const override; + void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + bool UseObjCMixedDispatch() const override { // This is only used with the non-fragile ABI and non-legacy dispatch. 
@@ -631,6 +598,8 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO { bool SupportsEmbeddedBitcode() const override; SanitizerMask getSupportedSanitizers() const override; + + void printVerboseInfo(raw_ostream &OS) const override; }; /// DarwinClang - The Darwin toolchain used by Clang. @@ -648,6 +617,16 @@ class LLVM_LIBRARY_VISIBILITY DarwinClang : public Darwin { llvm::opt::ArgStringList &CmdArgs, bool ForceLinkBuiltinRT = false) const override; + void AddClangCXXStdlibIncludeArgs( + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + + void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + + void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const override; + void AddCCKextLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; @@ -672,16 +651,15 @@ class LLVM_LIBRARY_VISIBILITY DarwinClang : public Darwin { StringRef Sanitizer, bool shared = true) const; - void - AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - bool AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, llvm::SmallString<128> Base, llvm::StringRef Version, llvm::StringRef ArchDir, llvm::StringRef BitDir) const; + + llvm::SmallString<128> + GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const; }; } // end namespace toolchains diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 764c1a54dc340..7d12dc02f401b 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1282,9 +1282,7 @@ static void initOption(AnalyzerOptions::ConfigTable &Config, Diags->Report(diag::err_analyzer_config_invalid_input) << Name << "a positive"; - auto Default = 
PositiveAnalyzerOption::create(DefaultVal); - assert(Default.has_value()); - OptionField = Default.value(); + OptionField = DefaultVal; } static void parseAnalyzerConfigs(AnalyzerOptions &AnOpts, diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 8eba766f21a64..29723b573e771 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1507,11 +1507,6 @@ static void InitializePredefinedMacros(const TargetInfo &TI, // ELF targets define __ELF__ if (TI.getTriple().isOSBinFormatELF()) Builder.defineMacro("__ELF__"); - else if (TI.getTriple().isAppleMachO()) - // Apple MachO targets define __MACH__ even when not using DarwinTargetInfo. - // Hurd will also define this in some circumstances, but that's done in - // HurdTargetInfo. Windows targets don't define this. - Builder.defineMacro("__MACH__"); // Target OS macro definitions. if (PPOpts.DefineTargetOSMacros) { diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp index bb2a21356fa8f..67c9d92b849ea 100644 --- a/clang/lib/Lex/InitHeaderSearch.cpp +++ b/clang/lib/Lex/InitHeaderSearch.cpp @@ -313,7 +313,7 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths( break; case llvm::Triple::UnknownOS: - if (triple.isWasm() || triple.isAppleMachO()) + if (triple.isWasm()) return false; break; diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 94f59bbc0aa36..ac5d51a1d2ff6 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1402,6 +1402,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OpenACCInitConstructClass: case Stmt::OpenACCShutdownConstructClass: case Stmt::OpenACCSetConstructClass: + case Stmt::OpenACCUpdateConstructClass: // These expressions can never throw. 
return CT_Cannot; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 562c98c6babe0..ae40895980d90 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -16592,6 +16592,13 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc, << TInfo->getTypeLoc().getSourceRange(); } + if (TInfo->getType()->isArrayType()) { + DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E, + PDiag(diag::warn_second_parameter_to_va_arg_array) + << TInfo->getType() + << TInfo->getTypeLoc().getSourceRange()); + } + // Check for va_arg where arguments of the given type will be promoted // (i.e. this va_arg is guaranteed to have undefined behavior). QualType PromoteType; diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 846b1966e765a..00cd3a009386e 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -709,18 +709,11 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitTileClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // constructs that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - // There is no prose in the standard that says duplicates aren't allowed, // but this diagnostic is present in other compilers, as well as makes // sense. Prose DOES exist for 'data' and 'host_data', 'set', 'enter data' and // 'exit data' both don't, but other implmementations do this. OpenACC issue - // 519 filed for the latter two. + // 519 filed for the latter two. Prose also exists for 'update'. // GCC allows this on init/shutdown, presumably for good reason, so we do too. 
if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Init && Clause.getDirectiveKind() != OpenACCDirectiveKind::Shutdown && @@ -944,13 +937,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // construct that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - // There is no prose in the standard that says duplicates aren't allowed, // but this diagnostic is present in other compilers, as well as makes // sense. @@ -1185,13 +1171,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute'/'combined'/'data' - // constructs, and 'compute'/'combined'/'data' constructs are the only - // construct that can do anything with this yet, so skip/treat as - // unimplemented in this case. - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); - return OpenACCWaitClause::Create( Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getDevNumExpr(), Clause.getQueuesLoc(), Clause.getQueueIdExprs(), Clause.getEndLoc()); @@ -1744,8 +1723,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitFinalizeClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitIfPresentClause( SemaOpenACC::OpenACCParsedClause &Clause) { - if (!isDirectiveKindImplemented(Clause.getDirectiveKind())) - return isNotImplemented(); // There isn't anything to do here, this is only valid on one construct, and // has no associated rules. 
return OpenACCIfPresentClause::Create(Ctx, Clause.getBeginLoc(), @@ -1936,6 +1913,7 @@ bool PreserveLoopRAIIDepthInAssociatedStmtRAII(OpenACCDirectiveKind DK) { case OpenACCDirectiveKind::Init: case OpenACCDirectiveKind::Shutdown: case OpenACCDirectiveKind::Set: + case OpenACCDirectiveKind::Update: llvm_unreachable("Doesn't have an associated stmt"); default: case OpenACCDirectiveKind::Invalid: @@ -2365,6 +2343,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, case OpenACCDirectiveKind::Init: case OpenACCDirectiveKind::Shutdown: case OpenACCDirectiveKind::Set: + case OpenACCDirectiveKind::Update: // Nothing to do here, there is no real legalization that needs to happen // here as these constructs do not take any arguments. break; @@ -3713,6 +3692,9 @@ bool SemaOpenACC::ActOnStartStmtDirective( OpenACCClauseKind::DeviceType, OpenACCClauseKind::If}); + // TODO: OpenACC: 'Update' construct needs to have one of 'self', 'host', or + // 'device'. Implement here. + return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/true); } @@ -3780,6 +3762,10 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective( return OpenACCSetConstruct::Create(getASTContext(), StartLoc, DirLoc, EndLoc, Clauses); } + case OpenACCDirectiveKind::Update: { + return OpenACCUpdateConstruct::Create(getASTContext(), StartLoc, DirLoc, + EndLoc, Clauses); + } } llvm_unreachable("Unhandled case in directive handling?"); } diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 418d8d8c81206..49b554516d5bb 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -611,6 +611,15 @@ Sema::ActOnLabelStmt(SourceLocation IdentLoc, LabelDecl *TheDecl, if (getCurScope()->isInOpenACCComputeConstructScope()) setFunctionHasBranchProtectedScope(); + // OpenACC3.3 2.14.4: + // The update directive is executable. It must not appear in place of the + // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or + // C++. 
+ if (isa(SubStmt)) { + Diag(SubStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*Label*/ 4; + SubStmt = new (Context) NullStmt(SubStmt->getBeginLoc()); + } + // Otherwise, things are good. Fill in the declaration and return it. LabelStmt *LS = new (Context) LabelStmt(IdentLoc, TheDecl, SubStmt); TheDecl->setStmt(LS); @@ -1005,6 +1014,15 @@ StmtResult Sema::ActOnIfStmt(SourceLocation IfLoc, Diags.Report(IfLoc, diag::warn_consteval_if_always_true) << Immediate; } + // OpenACC3.3 2.14.4: + // The update directive is executable. It must not appear in place of the + // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or + // C++. + if (isa(thenStmt)) { + Diag(thenStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*if*/ 0; + thenStmt = new (Context) NullStmt(thenStmt->getBeginLoc()); + } + return BuildIfStmt(IfLoc, StatementKind, LParenLoc, InitStmt, Cond, RParenLoc, thenStmt, ElseLoc, elseStmt); } @@ -1283,6 +1301,16 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch, getCurFunction()->SwitchStack.pop_back(); if (!BodyStmt) return StmtError(); + + // OpenACC3.3 2.14.4: + // The update directive is executable. It must not appear in place of the + // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or + // C++. + if (isa(BodyStmt)) { + Diag(BodyStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*switch*/ 3; + BodyStmt = new (Context) NullStmt(BodyStmt->getBeginLoc()); + } + SS->setBody(BodyStmt, SwitchLoc); Expr *CondExpr = SS->getCond(); @@ -1760,6 +1788,15 @@ StmtResult Sema::ActOnWhileStmt(SourceLocation WhileLoc, !Diags.isIgnored(diag::warn_comma_operator, CondVal.second->getExprLoc())) CommaVisitor(*this).Visit(CondVal.second); + // OpenACC3.3 2.14.4: + // The update directive is executable. It must not appear in place of the + // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or + // C++. 
+ if (isa(Body)) { + Diag(Body->getBeginLoc(), diag::err_acc_update_as_body) << /*while*/ 1; + Body = new (Context) NullStmt(Body->getBeginLoc()); + } + if (isa(Body)) getCurCompoundScope().setHasEmptyLoopBodies(); @@ -1789,6 +1826,15 @@ Sema::ActOnDoStmt(SourceLocation DoLoc, Stmt *Body, !Diags.isIgnored(diag::warn_comma_operator, Cond->getExprLoc())) CommaVisitor(*this).Visit(Cond); + // OpenACC3.3 2.14.4: + // The update directive is executable. It must not appear in place of the + // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or + // C++. + if (isa(Body)) { + Diag(Body->getBeginLoc(), diag::err_acc_update_as_body) << /*do*/ 2; + Body = new (Context) NullStmt(Body->getBeginLoc()); + } + return new (Context) DoStmt(Body, Cond, DoLoc, WhileLoc, CondRParen); } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index c40ff8b0d2011..bff1e5bd8f078 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4178,6 +4178,15 @@ class TreeTransform { SourceLocation{}, {}, SourceLocation{}, EndLoc, Clauses, {}); } + StmtResult RebuildOpenACCUpdateConstruct(SourceLocation BeginLoc, + SourceLocation DirLoc, + SourceLocation EndLoc, + ArrayRef Clauses) { + return getSema().OpenACC().ActOnEndStmtDirective( + OpenACCDirectiveKind::Update, BeginLoc, DirLoc, SourceLocation{}, + SourceLocation{}, {}, SourceLocation{}, EndLoc, Clauses, {}); + } + StmtResult RebuildOpenACCWaitConstruct( SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation LParenLoc, Expr *DevNumExpr, SourceLocation QueuesLoc, ArrayRef QueueIdExprs, @@ -12471,6 +12480,23 @@ TreeTransform::TransformOpenACCSetConstruct(OpenACCSetConstruct *C) { TransformedClauses); } +template +StmtResult TreeTransform::TransformOpenACCUpdateConstruct( + OpenACCUpdateConstruct *C) { + getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc()); + + llvm::SmallVector TransformedClauses = + 
getDerived().TransformOpenACCClauseList(C->getDirectiveKind(), + C->clauses()); + if (getSema().OpenACC().ActOnStartStmtDirective( + C->getDirectiveKind(), C->getBeginLoc(), TransformedClauses)) + return StmtError(); + + return getDerived().RebuildOpenACCUpdateConstruct( + C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(), + TransformedClauses); +} + template StmtResult TreeTransform::TransformOpenACCWaitConstruct(OpenACCWaitConstruct *C) { diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 32e20c1508144..4766f34e9f3a8 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2880,6 +2880,11 @@ void ASTStmtReader::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) { VisitOpenACCConstructStmt(S); } +void ASTStmtReader::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) { + VisitStmt(S); + VisitOpenACCConstructStmt(S); +} + void ASTStmtReader::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) { VisitStmt(S); VisitOpenACCAssociatedStmtConstruct(S); @@ -4417,6 +4422,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = OpenACCSetConstruct::CreateEmpty(Context, NumClauses); break; } + case STMT_OPENACC_UPDATE_CONSTRUCT: { + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields]; + S = OpenACCUpdateConstruct::CreateEmpty(Context, NumClauses); + break; + } case EXPR_REQUIRES: { unsigned numLocalParameters = Record[ASTStmtReader::NumExprFields]; unsigned numRequirement = Record[ASTStmtReader::NumExprFields + 1]; diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index de0e7bf5f176f..7eedf7da7d3fc 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2963,6 +2963,12 @@ void ASTStmtWriter::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) { Code = serialization::STMT_OPENACC_SET_CONSTRUCT; } +void 
ASTStmtWriter::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) { + VisitStmt(S); + VisitOpenACCConstructStmt(S); + Code = serialization::STMT_OPENACC_UPDATE_CONSTRUCT; +} + void ASTStmtWriter::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) { VisitStmt(S); VisitOpenACCAssociatedStmtConstruct(S); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 70e95c2c644c0..ff8bdcea9a220 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1833,6 +1833,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OpenACCInitConstructClass: case Stmt::OpenACCShutdownConstructClass: case Stmt::OpenACCSetConstructClass: + case Stmt::OpenACCUpdateConstructClass: case Stmt::OMPUnrollDirectiveClass: case Stmt::OMPMetaDirectiveClass: case Stmt::HLSLOutArgExprClass: { diff --git a/clang/test/AST/ast-print-openacc-update-construct.cpp b/clang/test/AST/ast-print-openacc-update-construct.cpp new file mode 100644 index 0000000000000..627c15be4b863 --- /dev/null +++ b/clang/test/AST/ast-print-openacc-update-construct.cpp @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -fopenacc -ast-print %s -o - | FileCheck %s +void uses(bool cond) { + int I; + int *iPtr; + int array[5]; + // CHECK: #pragma acc update +#pragma acc update + +// CHECK: #pragma acc update if_present +#pragma acc update if_present +// CHECK: #pragma acc update if(cond) +#pragma acc update if(cond) + +// CHECK: #pragma acc update async +#pragma acc update async +// CHECK: #pragma acc update async(*iPtr) +#pragma acc update async(*iPtr) +// CHECK: #pragma acc update async(I) +#pragma acc update async(I) + +// CHECK: #pragma acc update wait(*iPtr, I) async +#pragma acc update wait(*iPtr, I) async + +// CHECK: #pragma acc update wait(queues: *iPtr, I) async(*iPtr) +#pragma acc update wait(queues:*iPtr, I) async(*iPtr) + +// CHECK: #pragma acc update wait(devnum: I : *iPtr, I) async(I) +#pragma 
acc update wait(devnum:I:*iPtr, I) async(I) + +// CHECK: #pragma acc update wait(devnum: I : queues: *iPtr, I) if(I == array[I]) async(I) +#pragma acc update wait(devnum:I:queues:*iPtr, I) if(I == array[I]) async(I) +} diff --git a/clang/test/CodeGen/sanitize-type-globals.cpp b/clang/test/CodeGen/sanitize-type-globals.cpp index 7cb8de8b238cc..1154ab4ca5df2 100644 --- a/clang/test/CodeGen/sanitize-type-globals.cpp +++ b/clang/test/CodeGen/sanitize-type-globals.cpp @@ -3,7 +3,10 @@ //. // CHECK: @x = global %struct.CompleteS zeroinitializer, align 8 +// CHECK: @xExtern = external global %struct.CompleteS, align 8 // CHECK: @y = external global %struct.S, align 1 +// CHECK: @d = global %class.b zeroinitializer, align 1 +// CHECK: @_ZN1b1eE = external global %class.a, align 1 // CHECK: @__tysan_shadow_memory_address = external global i64 // CHECK: @__tysan_app_memory_mask = external global i64 // CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat @@ -12,8 +15,9 @@ // CHECK: @__tysan_v1_any_20pointer = linkonce_odr constant { i64, i64, ptr, i64, [12 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [12 x i8] c"any pointer\00" }, comdat // CHECK: @__tysan_v1_p1_20int = linkonce_odr constant { i64, i64, ptr, i64, [7 x i8] } { i64 2, i64 1, ptr @__tysan_v1_any_20pointer, i64 0, [7 x i8] c"p1 int\00" }, comdat // CHECK: @__tysan_v1___ZTS9CompleteS = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [15 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_p1_20int, i64 8, [15 x i8] c"_ZTS9CompleteS\00" }, comdat -// CHECK: @llvm.used = appending global [7 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_p1_20int, ptr @__tysan_v1___ZTS9CompleteS], section "llvm.metadata" -// CHECK: @llvm.global_ctors = appending global 
[1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] +// CHECK: @__tysan_v1___ZTS1b = linkonce_odr constant { i64, i64, [7 x i8] } { i64 2, i64 0, [7 x i8] c"_ZTS1b\00" }, comdat +// CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_p1_20int, ptr @__tysan_v1___ZTS9CompleteS, ptr @__tysan_v1___ZTS1b], section "llvm.metadata" +// CHECK: @llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_sanitize_type_globals.cpp, ptr null }, { i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }] //. struct CompleteS { int x; @@ -22,13 +26,18 @@ struct CompleteS { void f(CompleteS *); CompleteS x; +extern CompleteS xExtern; // CHECK-LABEL: define dso_local void @_Z1gv( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK: [[ENTRY:.*:]] // CHECK: call void @_Z1fP9CompleteS(ptr noundef @x) +// CHECK: call void @_Z1fP9CompleteS(ptr noundef @xExtern) // CHECK: ret void // -void g() { f(&x); } +void g() { + f(&x); + f(&xExtern); +} typedef struct S IncompleteS; void f(IncompleteS *); @@ -40,11 +49,21 @@ extern IncompleteS y; // CHECK: ret void // void h() { f(&y); } + +class a; +class b { +public: + using c = a; + static c e; + b(int, c & = e); +} d = 0; + //. 
// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone sanitize_type "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } // CHECK: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { noinline nounwind sanitize_type "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind } //. // CHECK: [[META0:![0-9]+]] = !{ptr @x, [[META1:![0-9]+]]} // CHECK: [[META1]] = !{!"_ZTS9CompleteS", [[META2:![0-9]+]], i64 0, [[META5:![0-9]+]], i64 8} @@ -53,6 +72,8 @@ void h() { f(&y); } // CHECK: [[META4]] = !{!"Simple C++ TBAA"} // CHECK: [[META5]] = !{!"p1 int", [[META6:![0-9]+]], i64 0} // CHECK: [[META6]] = !{!"any pointer", [[META3]], i64 0} -// CHECK: [[META7:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -// CHECK: [[META8:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +// CHECK: [[META7:![0-9]+]] = !{ptr @d, [[META8:![0-9]+]]} +// CHECK: [[META8]] = !{!"_ZTS1b"} +// CHECK: [[META9:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META10:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} //. diff --git a/clang/test/CodeGen/xcore-abi.c b/clang/test/CodeGen/xcore-abi.c index bb8d2fec46bdb..40e2f418f7304 100644 --- a/clang/test/CodeGen/xcore-abi.c +++ b/clang/test/CodeGen/xcore-abi.c @@ -76,7 +76,8 @@ void testva (int n, ...) 
{ // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[V5]], ptr align 4 [[P]], i32 20, i1 false) // CHECK: call void @f(ptr noundef [[V5]]) - int* v6 = va_arg (ap, int[4]); // an unusual aggregate type + // an unusual aggregate type + int* v6 = va_arg (ap, int[4]); // expected-warning{{second argument to 'va_arg' is of array type 'int[4]'}} f(v6); // CHECK: [[I:%[a-z0-9]+]] = load ptr, ptr [[AP]] // CHECK: [[P:%[a-z0-9]+]] = load ptr, ptr [[I]] diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/include/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/include/.keep deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/local/include/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/local/include/.keep deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/config-zos/clang.cfg b/clang/test/Driver/Inputs/config-zos/clang.cfg new file mode 100644 index 0000000000000..43a5dbfaa6182 --- /dev/null +++ b/clang/test/Driver/Inputs/config-zos/clang.cfg @@ -0,0 +1 @@ +-DABC=123 diff --git a/clang/test/Driver/Inputs/config-zos/def.cfg b/clang/test/Driver/Inputs/config-zos/def.cfg new file mode 100644 index 0000000000000..156f9c85fb4f2 --- /dev/null +++ b/clang/test/Driver/Inputs/config-zos/def.cfg @@ -0,0 +1 @@ +-DDEF=456 diff --git a/clang/test/Driver/Inputs/config-zos/tst/def.cfg b/clang/test/Driver/Inputs/config-zos/tst/def.cfg new file mode 100644 index 0000000000000..156f9c85fb4f2 --- /dev/null +++ 
b/clang/test/Driver/Inputs/config-zos/tst/def.cfg @@ -0,0 +1 @@ +-DDEF=456 diff --git a/clang/test/Driver/config-zos.c b/clang/test/Driver/config-zos.c new file mode 100644 index 0000000000000..8de02ec101b91 --- /dev/null +++ b/clang/test/Driver/config-zos.c @@ -0,0 +1,17 @@ +// REQUIRES: shell +// REQUIRES: systemz-registered-target + +// RUN: unset CLANG_NO_DEFAULT_CONFIG +// RUN: rm -rf %t && mkdir %t + +// RUN: mkdir -p %t/testbin +// RUN: mkdir -p %t/etc +// RUN: ln -s %clang %t/testbin/clang +// RUN: echo "-DXYZ=789" >%t/etc/clang.cfg +// RUN: %t/testbin/clang --target=s390x-ibm-zos -c -### -no-canonical-prefixes %s 2>&1 | FileCheck -DDIR=%t %s +// RUN: %t/testbin/clang --target=s390x-ibm-zos -c -### -no-canonical-prefixes --no-default-config %s 2>&1 | FileCheck -check-prefix=NOCONFIG %s +// +// CHECK: Configuration file: [[DIR]]/etc/clang.cfg +// CHECK: "-D" "XYZ=789" +// NOCONFIG-NOT: Configuration file: {{.*}}/etc/clang.cfg +// NOCONFIG-NOT: "-D" "XYZ=789" diff --git a/clang/test/Driver/config-zos1.c b/clang/test/Driver/config-zos1.c new file mode 100644 index 0000000000000..5b1012d00736c --- /dev/null +++ b/clang/test/Driver/config-zos1.c @@ -0,0 +1,23 @@ +// REQUIRES: shell +// REQUIRES: systemz-registered-target + +// RUN: unset CLANG_NO_DEFAULT_CONFIG + +// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos +// RUN: %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s +// CHECK: Configuration file: {{.*}}/Inputs/config-zos/clang.cfg +// CHECK: "-D" "ABC=123" + +// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/def.cfg +// RUN: %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-DEF +// CHECK-DEF: Configuration file: {{.*}}/Inputs/config-zos/def.cfg +// CHECK-DEF: "-D" "DEF=456" + +// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/Garbage +// RUN: not %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-ERR +// CHECK-ERR: error: configuration file '{{.*}}/Inputs/config-zos/Garbage' 
cannot be found + +// The directory exists but no clang.cfg in it +// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/tst +// RUN: not %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-ERRDIR +// CHECK-ERRDIR: error: configuration file '{{.*}}/Inputs/config-zos/tst/clang.cfg' cannot be found diff --git a/clang/test/Driver/darwin-embedded-search-paths.c b/clang/test/Driver/darwin-embedded-search-paths.c deleted file mode 100644 index 030086a20865b..0000000000000 --- a/clang/test/Driver/darwin-embedded-search-paths.c +++ /dev/null @@ -1,43 +0,0 @@ -// UNSUPPORTED: system-windows -// Windows is unsupported because we use the Unix path separator `/` in the test. -// XFAIL: * -// Unlike the Darwin driver, the MachO driver doesn't add any framework search paths, -// only the normal header ones. -// RUN: %clang -x c -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \ -// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s - -// Unlike the Darwin driver, the MachO driver doesn't default to libc++ -// RUN: %clang -x c++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \ -// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s - -// However, if the user requests libc++, the MachO driver should find the search path. -// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \ -// RUN: | FileCheck --check-prefixes=CC1,CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s - -// Verify that embedded uses can swap in alternate usr/include and usr/local/include directories. -// usr/local/include is specified in the driver as -internal-isystem, however, the driver generated -// paths come before the paths in the driver arguments. 
In order to keep usr/local/include in the -// same position, -isystem has to be used instead of -Xclang -internal-isystem. There isn't an -// -externc-isystem, but it's ok to use -Xclang -internal-externc-isystem since the driver doesn't -// use that if -nostdlibinc or -nostdinc is passed. -// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk \ -// RUN: -nostdlibinc -isystem %S/Inputs/MacOSX15.1.sdk/embedded/usr/local/include \ -// RUN: -Xclang -internal-externc-isystem -Xclang %S/Inputs/MacOSX15.1.sdk/embedded/usr/include \ -// RUN: -### -c %s 2>&1 | FileCheck --check-prefixes=CC1,NO-CXX,EULI,CI,EUI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s - - -// The ordering of these flags doesn't matter, and so this test is a little -// fragile. i.e. all of the -internal-isystem paths will be searched before the -// -internal-externc-isystem ones, and their order on the command line doesn't -// matter. The line order here is just the current order that the driver writes -// the cc1 arguments. 
- -// CC1: "-cc1" -// NO-CXX-NOT: "-internal-isystem" "[[SDKROOT]]/usr/include/c++/v1" -// CXX-SAME: "-internal-isystem" "[[SDKROOT]]/usr/include/c++/v1" -// ULI-SAME: "-internal-isystem" "[[SDKROOT]]/usr/local/include" -// EULI-SAME: "-isystem" "[[SDKROOT]]/embedded/usr/local/include" -// CI-SAME: "-internal-isystem" "{{.*}}/clang/{{[[:digit:].]*}}/include" -// UI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/usr/include" -// EUI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/embedded/usr/include" -// NO-FW-NOT: "-internal-iframework" diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index 27970615c9581..9b88c147d0faa 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -343,23 +343,18 @@ struct HasMembersArray { void SelfUpdate() { struct Members s; - // expected-error@+2{{expected '('}} - // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}} + // expected-error@+1{{expected '('}} #pragma acc update self for(int i = 0; i < 5;++i) {} - // expected-error@+6{{use of undeclared identifier 'zero'}} - // expected-error@+5{{expected ','}} - // expected-error@+4{{expected expression}} - // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}} - // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, clause ignored}} - // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}} + // expected-error@+4{{use of undeclared identifier 'zero'}} + // expected-error@+3{{expected ','}} + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC clause 'self' not yet implemented, clause ignored}} #pragma acc update self(zero : s.array[s.value : 5], s.value), if_present for(int i = 0; i < 5;++i) {} - // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}} - // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, 
clause ignored}} - // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}} + // expected-warning@+1{{OpenACC clause 'self' not yet implemented, clause ignored}} #pragma acc update self(s.array[s.value : 5], s.value), if_present for(int i = 0; i < 5;++i) {} } diff --git a/clang/test/ParserOpenACC/parse-constructs.c b/clang/test/ParserOpenACC/parse-constructs.c index 7f090f828feb7..9948e33ac94d1 100644 --- a/clang/test/ParserOpenACC/parse-constructs.c +++ b/clang/test/ParserOpenACC/parse-constructs.c @@ -151,8 +151,7 @@ void func() { // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}} #pragma acc set clause list for(;;){} - // expected-error@+2{{invalid OpenACC clause 'clause'}} - // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}} + // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc update clause list for(;;){} } diff --git a/clang/test/Preprocessor/macho-embedded-predefines.c b/clang/test/Preprocessor/macho-embedded-predefines.c index a7e5777a89a98..74f29199218c4 100644 --- a/clang/test/Preprocessor/macho-embedded-predefines.c +++ b/clang/test/Preprocessor/macho-embedded-predefines.c @@ -3,18 +3,18 @@ // CHECK-7M: #define __APPLE_CC__ // CHECK-7M: #define __APPLE__ // CHECK-7M: #define __ARM_ARCH_7M__ -// CHECK-7M: #define __MACH__ +// CHECK-7M-NOT: #define __MACH__ // RUN: %clang_cc1 -E -dM -triple thumbv7em-apple-unknown-macho -target-cpu cortex-m4 %s | FileCheck %s -check-prefix CHECK-7EM // CHECK-7EM: #define __APPLE_CC__ // CHECK-7EM: #define __APPLE__ // CHECK-7EM: #define __ARM_ARCH_7EM__ -// CHECK-7EM: #define __MACH__ +// CHECK-7EM-NOT: #define __MACH__ // RUN: %clang_cc1 -E -dM -triple thumbv6m-apple-unknown-macho -target-cpu cortex-m0 %s | FileCheck %s -check-prefix CHECK-6M // CHECK-6M: #define __APPLE_CC__ // CHECK-6M: #define __APPLE__ // CHECK-6M: #define __ARM_ARCH_6M__ -// CHECK-6M: 
#define __MACH__ +// CHECK-6M-NOT: #define __MACH__ diff --git a/clang/test/Sema/varargs.c b/clang/test/Sema/varargs.c index 2cb7270f604a0..bec41dda65d57 100644 --- a/clang/test/Sema/varargs.c +++ b/clang/test/Sema/varargs.c @@ -75,6 +75,11 @@ void f9(__builtin_va_list args) (void)__builtin_va_arg(args, enum E); // Don't warn here in C (void)__builtin_va_arg(args, short); // expected-warning {{second argument to 'va_arg' is of promotable type 'short'}} (void)__builtin_va_arg(args, char); // expected-warning {{second argument to 'va_arg' is of promotable type 'char'}} + // Don't crash on some undefined behaviors. + int n; + (void)__builtin_va_arg(args, int[10]); // expected-warning{{second argument to 'va_arg' is of array type 'int[10]'}} + (void)__builtin_va_arg(args, int[++n]); // expected-warning{{second argument to 'va_arg' is of array type 'int[++n]'}} + (void)__builtin_va_arg(args, int[n][n]); // expected-warning{{second argument to 'va_arg' is of array type 'int[n][n]'}} } void f10(int a, ...) { diff --git a/clang/test/SemaOpenACC/update-construct-ast.cpp b/clang/test/SemaOpenACC/update-construct-ast.cpp new file mode 100644 index 0000000000000..f55409d99a13c --- /dev/null +++ b/clang/test/SemaOpenACC/update-construct-ast.cpp @@ -0,0 +1,200 @@ +// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s + +// Test this with PCH. 
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s +// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s + +#ifndef PCH_HELPER +#define PCH_HELPER + +int some_int(); +long some_long(); + +void NormalFunc() { + // CHECK-LABEL: NormalFunc + // CHECK-NEXT: CompoundStmt + +#pragma acc update if_present if (some_int() < some_long()) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: if_present clause + // CHECK-NEXT: if clause + // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'long' + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}} 'long' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()' + +#pragma acc update wait async + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: async clause +#pragma acc update wait(some_int(), some_long()) async(some_int()) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()' + // CHECK-NEXT: async clause + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' +#pragma acc update wait(queues:some_int(), some_long()) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()' +#pragma acc update 
wait(devnum: some_int() :some_int(), some_long()) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'int' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()' + // CHECK-NEXT: CallExpr{{.*}}'long' + // CHECK-NEXT: ImplicitCastExpr + // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()' +} + +template +void TemplFunc(T t) { + // CHECK-LABEL: FunctionTemplateDecl {{.*}}TemplFunc + // CHECK-NEXT: TemplateTypeParmDecl + // CHECK-NEXT: FunctionDecl{{.*}}TemplFunc + // CHECK-NEXT: ParmVarDecl{{.*}} t 'T' + // CHECK-NEXT: CompoundStmt + +#pragma acc update if_present if (T::value < t) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: if_present clause + // CHECK-NEXT: if clause + // CHECK-NEXT: BinaryOperator{{.*}}'' '<' + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'T' + +#pragma acc update wait async + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: async clause +#pragma acc update wait(T::value, t) async(T::value) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T' + // CHECK-NEXT: async clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' +#pragma acc update wait(queues:T::value, t) async(t) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: DeclRefExpr{{.*}} 
't' 'T' + // CHECK-NEXT: async clause + // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T' +#pragma acc update wait(devnum: T::value:t, T::value) + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T' + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + + // Instantiation: + // CHECK-NEXT: FunctionDecl{{.*}} TemplFunc 'void (SomeStruct)' implicit_instantiation + // CHECK-NEXT: TemplateArgument type 'SomeStruct' + // CHECK-NEXT: RecordType{{.*}} 'SomeStruct' + // CHECK-NEXT: CXXRecord{{.*}} 'SomeStruct' + // CHECK-NEXT: ParmVarDecl{{.*}} t 'SomeStruct' + // CHECK-NEXT: CompoundStmt + + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: if_present clause + // CHECK-NEXT: if clause + // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<' + // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int' + // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct' + // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int' + // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int + // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct' + + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: async clause + + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int' + // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int + // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct' + // 
CHECK-NEXT: async clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct' + + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: <<>> + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int' + // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int + // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct' + // CHECK-NEXT: async clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int' + // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int + // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct' + + // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update + // CHECK-NEXT: wait clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int' + // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int + // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int' + // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct' +} + +struct SomeStruct{ + static constexpr unsigned value = 5; + operator unsigned(); +}; +void use() { + TemplFunc(SomeStruct{}); +} +#endif diff --git a/clang/test/SemaOpenACC/update-construct.cpp b/clang/test/SemaOpenACC/update-construct.cpp new file mode 100644 index 0000000000000..6aa7613d2b81d --- /dev/null +++ b/clang/test/SemaOpenACC/update-construct.cpp @@ -0,0 +1,136 @@ +// RUN: %clang_cc1 
%s -fopenacc -verify + +struct NotConvertible{} NC; +int getI(); +void uses() { + int Var; + // expected-warning@+1{{OpenACC clause 'self' not yet implemented}} +#pragma acc update async self(Var) + // expected-warning@+1{{OpenACC clause 'self' not yet implemented}} +#pragma acc update wait self(Var) + // expected-warning@+2{{OpenACC clause 'self' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update self(Var) device_type(I) + // expected-warning@+1{{OpenACC clause 'self' not yet implemented}} +#pragma acc update if(true) self(Var) + // expected-warning@+1{{OpenACC clause 'self' not yet implemented}} +#pragma acc update if_present self(Var) + // expected-warning@+1{{OpenACC clause 'self' not yet implemented}} +#pragma acc update self(Var) + // expected-warning@+1{{OpenACC clause 'host' not yet implemented}} +#pragma acc update host(Var) + // expected-warning@+1{{OpenACC clause 'device' not yet implemented}} +#pragma acc update device(Var) + + // TODO: OpenACC: These all should diagnose as they aren't allowed after + // device_type. 
+ // expected-warning@+3{{OpenACC clause 'self' not yet implemented}} + // expected-warning@+2{{OpenACC clause 'device_type' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update self(Var) device_type(I) device_type(I) + // expected-warning@+2{{OpenACC clause 'self' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update self(Var) device_type(I) if(true) + // expected-warning@+2{{OpenACC clause 'self' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update self(Var) device_type(I) if_present + // expected-warning@+2{{OpenACC clause 'device_type' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'self' not yet implemented}} +#pragma acc update device_type(I) self(Var) + // expected-warning@+2{{OpenACC clause 'device_type' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'host' not yet implemented}} +#pragma acc update device_type(I) host(Var) + // expected-warning@+2{{OpenACC clause 'device_type' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'device' not yet implemented}} +#pragma acc update device_type(I) device(Var) + // These 2 are OK. + // expected-warning@+2{{OpenACC clause 'self' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update self(Var) device_type(I) async + // expected-warning@+2{{OpenACC clause 'self' not yet implemented}} + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update self(Var) device_type(I) wait + + // TODO: OpenACC: These should diagnose because there isn't at least 1 of + // 'self', 'host', or 'device'. 
+#pragma acc update async +#pragma acc update wait + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update device_type(I) +#pragma acc update if(true) +#pragma acc update if_present + + // expected-error@+2{{value of type 'struct NotConvertible' is not contextually convertible to 'bool'}} + // expected-warning@+1{{OpenACC clause 'device_type' not yet implemented}} +#pragma acc update if (NC) device_type(I) + + // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'update' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc update if(true) if (false) + + // TODO: OpenACC: There is restrictions on the contents of a 'varlist', so + // those should be checked here too. + + // Cannot be the body of an 'if', 'while', 'do', 'switch', or + // 'label'. + // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following an if statement}} + if (true) + // expected-warning@+1{{OpenACC clause 'device' not yet implemented}} +#pragma acc update device(Var) + + // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a while statement}} + while (true) + // expected-warning@+1{{OpenACC clause 'device' not yet implemented}} +#pragma acc update device(Var) + + // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a do statement}} + do + // expected-warning@+1{{OpenACC clause 'device' not yet implemented}} +#pragma acc update device(Var) + while (true); + + // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a switch statement}} + switch(Var) + // expected-warning@+1{{OpenACC clause 'device' not yet implemented}} +#pragma acc update device(Var) + + // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a label statement}} + LABEL: + // expected-warning@+1{{OpenACC clause 'device' not yet 
implemented}} +#pragma acc update device(Var) + + // For loops are OK. + for (;;) + // expected-warning@+1{{OpenACC clause 'device' not yet implemented}} +#pragma acc update device(Var) + + // Checking for 'async', which requires an 'int' expression. +#pragma acc update async + +#pragma acc update async(getI()) + // expected-error@+2{{expected ')'}} + // expected-note@+1{{to match this '('}} +#pragma acc update async(getI(), getI()) + // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'update' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc update async(getI()) async(getI()) + // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc update async(NC) + + // Checking for 'wait', which has a complicated set arguments. +#pragma acc update wait +#pragma acc update wait() +#pragma acc update wait(getI(), getI()) +#pragma acc update wait(devnum: getI(): getI()) +#pragma acc update wait(devnum: getI(): queues: getI(), getI()) + // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc update wait(devnum:NC : 5) + // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc update wait(devnum:5 : NC) + + int arr[5]; + // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}} + // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}} + // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}} +#pragma acc update wait(devnum:arr : queues: arr, NC, 5) +} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 3e761024392cb..4114d9a37f1ec 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -6439,6 +6439,8 @@ 
CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) { return cxstring::createRef("OpenACCShutdownConstruct"); case CXCursor_OpenACCSetConstruct: return cxstring::createRef("OpenACCSetConstruct"); + case CXCursor_OpenACCUpdateConstruct: + return cxstring::createRef("OpenACCUpdateConstruct"); } llvm_unreachable("Unhandled CXCursorKind"); diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index cbc3485d41970..ee276d8e4e148 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -912,6 +912,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::OpenACCSetConstructClass: K = CXCursor_OpenACCSetConstruct; break; + case Stmt::OpenACCUpdateConstructClass: + K = CXCursor_OpenACCUpdateConstruct; + break; case Stmt::OMPTargetParallelGenericLoopDirectiveClass: K = CXCursor_OMPTargetParallelGenericLoopDirective; break; diff --git a/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp b/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp index ed8627c500098..626f5c163d17d 100644 --- a/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp +++ b/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp @@ -27,22 +27,13 @@ static constexpr std::optional UNDEF = std::nullopt; static unsigned operator""_ms(unsigned long long ms) { return ms; } static unsigned operator""_step(unsigned long long rlimit) { return rlimit; } -template static Ret makeDefaultOption(Arg Value) { - return Value; -} -template <> PositiveAnalyzerOption makeDefaultOption(int Value) { - auto DefaultVal = PositiveAnalyzerOption::create(Value); - assert(DefaultVal.has_value()); - return DefaultVal.value(); -} - static const AnalyzerOptions DefaultOpts = [] { AnalyzerOptions Config; #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC, \ SHALLOW_VAL, DEEP_VAL) \ ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEEP_VAL) #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL) \ - 
Config.NAME = makeDefaultOption(DEFAULT_VAL); + Config.NAME = DEFAULT_VAL; #include "clang/StaticAnalyzer/Core/AnalyzerOptions.def" // Remember to update the tests in this file when these values change. diff --git a/compiler-rt/lib/gwp_asan/tests/harness.h b/compiler-rt/lib/gwp_asan/tests/harness.h index c96f846996d35..3fbcf991c5592 100644 --- a/compiler-rt/lib/gwp_asan/tests/harness.h +++ b/compiler-rt/lib/gwp_asan/tests/harness.h @@ -12,7 +12,9 @@ #include #if defined(__Fuchsia__) +#ifndef ZXTEST_USE_STREAMABLE_MACROS #define ZXTEST_USE_STREAMABLE_MACROS +#endif #include namespace testing = zxtest; // zxtest defines a different ASSERT_DEATH, taking a lambda and an error message diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index f1fe20b255d9c..7ec0382b58566 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -337,6 +337,23 @@ INTERCEPTOR(FILE *, fmemopen, void *buf, size_t size, const char *mode) { #define RTSAN_MAYBE_INTERCEPT_FMEMOPEN #endif +#if SANITIZER_INTERCEPT_SETVBUF +INTERCEPTOR(void, setbuf, FILE *stream, char *buf) { + __rtsan_notify_intercepted_call("setbuf"); + return REAL(setbuf)(stream, buf); +} + +INTERCEPTOR(int, setvbuf, FILE *stream, char *buf, int mode, size_t size) { + __rtsan_notify_intercepted_call("setvbuf"); + return REAL(setvbuf)(stream, buf, mode, size); +} +#define RTSAN_MAYBE_INTERCEPT_SETBUF INTERCEPT_FUNCTION(setbuf) +#define RTSAN_MAYBE_INTERCEPT_SETVBUF INTERCEPT_FUNCTION(setvbuf) +#else +#define RTSAN_MAYBE_INTERCEPT_SETBUF +#define RTSAN_MAYBE_INTERCEPT_SETVBUF +#endif + INTERCEPTOR(int, puts, const char *s) { __rtsan_notify_intercepted_call("puts"); return REAL(puts)(s); @@ -999,6 +1016,8 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE; RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM; RTSAN_MAYBE_INTERCEPT_FMEMOPEN; + RTSAN_MAYBE_INTERCEPT_SETBUF; + 
RTSAN_MAYBE_INTERCEPT_SETVBUF; INTERCEPT_FUNCTION(lseek); RTSAN_MAYBE_INTERCEPT_LSEEK64; INTERCEPT_FUNCTION(dup); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index d9872c54b2614..db0ec951ad10c 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -403,6 +403,34 @@ TEST_F(RtsanFileTest, FmemOpenDiesWhenRealtime) { } #endif +#if SANITIZER_INTERCEPT_SETVBUF +TEST_F(RtsanFileTest, SetbufDieWhenRealtime) { + char buffer[BUFSIZ]; + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + + auto Func = [&f, &buffer]() { setbuf(f, buffer); }; + + ExpectRealtimeDeath(Func, "setbuf"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanFileTest, SetvbufDieWhenRealtime) { + char buffer[1024]; + size_t size = sizeof(buffer); + FILE *f = fopen(GetTemporaryFilePath(), "w"); + EXPECT_THAT(f, Ne(nullptr)); + + auto Func = [&f, &buffer, &size]() { + int r = setvbuf(f, buffer, _IOFBF, size); + EXPECT_THAT(r, Eq(0)); + }; + + ExpectRealtimeDeath(Func, "setvbuf"); + ExpectNonRealtimeSurvival(Func); +} +#endif + class RtsanOpenedFileTest : public RtsanFileTest { protected: void SetUp() override { diff --git a/compiler-rt/lib/ubsan/ubsan_value.h b/compiler-rt/lib/ubsan/ubsan_value.h index 430c9ea0dc8d1..ee523cf5ddda5 100644 --- a/compiler-rt/lib/ubsan/ubsan_value.h +++ b/compiler-rt/lib/ubsan/ubsan_value.h @@ -150,9 +150,12 @@ class TypeDescriptor { unsigned getIntegerBitCount() const { DCHECK(isIntegerTy()); - if (isSignedBitIntTy()) - return *reinterpret_cast(getBitIntBitCountPointer()); - else + if (isSignedBitIntTy()) { + u32 BitCountValue; + internal_memcpy(&BitCountValue, getBitIntBitCountPointer(), + sizeof(BitCountValue)); + return BitCountValue; + } else return getIntegerBitWidth(); } diff --git a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td 
b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td index 1dbde5c1c7302..2414de496d45b 100644 --- a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td +++ b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td @@ -57,6 +57,9 @@ def StrictSmallerWidthPred : Constraint>; +def PointerCompatiblePred + : Constraint>; + // floats or ints that undergo successive extensions or successive truncations. def ConvertConvertOptPattern : Pat<(fir_ConvertOp:$res (fir_ConvertOp:$irm $arg)), @@ -112,4 +115,18 @@ def ForwardConstantConvertPattern (createConstantOp $res, $attr), [(IndexTypePred $res), (IntegerTypePred $cnt)]>; +// Optimize redundant pointer conversions, e.g.: +// %1 = fir.convert %0 : +// (!fir.heap>) -> !fir.ref> +// %2 = fir.convert %1 : +// (!fir.ref>) -> !fir.heap> +// Will be optimized into: +// %2 = fir.convert %0 : +// (!fir.heap>) -> !fir.heap> +// which is redundant due to RedundantConvertOptPattern. +def ChainedPointerConvertsPattern + : Pat<(fir_ConvertOp:$res(fir_ConvertOp:$irm $arg)), (fir_ConvertOp $arg), + [(PointerCompatiblePred $arg), (PointerCompatiblePred $irm), + (PointerCompatiblePred $res)]>; + #endif // FORTRAN_FIR_REWRITE_PATTERNS diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index ba6622d8504a4..f57f0e7a77a01 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -29,6 +29,7 @@ add_flang_library(FortranLower OpenMP/DataSharingProcessor.cpp OpenMP/Decomposer.cpp OpenMP/OpenMP.cpp + OpenMP/PrivateReductionUtils.cpp OpenMP/ReductionProcessor.cpp OpenMP/Utils.cpp PFTBuilder.cpp diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index cd312537551ea..9dfdbd8337ae9 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -126,7 +126,8 @@ void DataSharingProcessor::cloneSymbol(const semantics::Symbol *sym) { assert(sb); 
mlir::Value addr = sb.getAddr(); assert(addr); - return hlfir::mayHaveAllocatableComponent(addr.getType()); + return !fir::isPointerType(addr.getType()) && + hlfir::mayHaveAllocatableComponent(addr.getType()); }; if (needInitClone()) { diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp new file mode 100644 index 0000000000000..83f0d4e93ca54 --- /dev/null +++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp @@ -0,0 +1,236 @@ +//===-- PrivateReductionUtils.cpp -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#include "PrivateReductionUtils.h" + +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/HLFIRTools.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/Support/FatalError.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Location.h" + +static void createCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Type argType, + mlir::Region &cleanupRegion) { + assert(cleanupRegion.empty()); + mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(), + {argType}, {loc}); + builder.setInsertionPointToEnd(block); + + auto typeError = [loc]() { + fir::emitFatalError(loc, + "Attempt to create an omp cleanup region " + "for a type that wasn't allocated", + /*genCrashDiag=*/true); + }; + + mlir::Type valTy = fir::unwrapRefType(argType); + if (auto boxTy = mlir::dyn_cast_or_null(valTy)) { + if 
(!mlir::isa(boxTy.getEleTy())) { + mlir::Type innerTy = fir::extractSequenceType(boxTy); + if (!mlir::isa(innerTy)) + typeError(); + } + + mlir::Value arg = builder.loadIfRef(loc, block->getArgument(0)); + assert(mlir::isa(arg.getType())); + + // Deallocate box + // The FIR type system doesn't nesecarrily know that this is a mutable box + // if we allocated the thread local array on the heap to avoid looped stack + // allocations. + mlir::Value addr = + hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg}); + mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr); + fir::IfOp ifOp = + builder.create(loc, isAllocated, /*withElseRegion=*/false); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + + mlir::Value cast = builder.createConvert( + loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr); + builder.create(loc, cast); + + builder.setInsertionPointAfter(ifOp); + builder.create(loc); + return; + } + + typeError(); +} + +fir::ShapeShiftOp Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value box) { + fir::SequenceType sequenceType = mlir::cast( + hlfir::getFortranElementOrSequenceType(box.getType())); + const unsigned rank = sequenceType.getDimension(); + llvm::SmallVector lbAndExtents; + lbAndExtents.reserve(rank * 2); + + mlir::Type idxTy = builder.getIndexType(); + for (unsigned i = 0; i < rank; ++i) { + // TODO: ideally we want to hoist box reads out of the critical section. 
+ // We could do this by having box dimensions in block arguments like + // OpenACC does + mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); + auto dimInfo = + builder.create(loc, idxTy, idxTy, idxTy, box, dim); + lbAndExtents.push_back(dimInfo.getLowerBound()); + lbAndExtents.push_back(dimInfo.getExtent()); + } + + auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank); + auto shapeShift = + builder.create(loc, shapeShiftTy, lbAndExtents); + return shapeShift; +} + +void Fortran::lower::omp::populateByRefInitAndCleanupRegions( + fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type argType, + mlir::Value scalarInitValue, mlir::Block *initBlock, + mlir::Value allocatedPrivVarArg, mlir::Value moldArg, + mlir::Region &cleanupRegion) { + mlir::Type ty = fir::unwrapRefType(argType); + builder.setInsertionPointToEnd(initBlock); + auto yield = [&](mlir::Value ret) { + builder.create(loc, ret); + }; + + if (fir::isa_trivial(ty)) { + builder.setInsertionPointToEnd(initBlock); + + if (scalarInitValue) + builder.createStoreWithConvert(loc, scalarInitValue, allocatedPrivVarArg); + yield(allocatedPrivVarArg); + return; + } + + // check if an allocatable box is unallocated. If so, initialize the boxAlloca + // to be unallocated e.g. + // %box_alloca = fir.alloca !fir.box> + // %addr = fir.box_addr %box + // if (%addr == 0) { + // %nullbox = fir.embox %addr + // fir.store %nullbox to %box_alloca + // } else { + // // ... 
+ // fir.store %something to %box_alloca + // } + // omp.yield %box_alloca + moldArg = builder.loadIfRef(loc, moldArg); + auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp { + mlir::Value addr = builder.create(loc, moldArg); + mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr); + fir::IfOp ifOp = builder.create(loc, isNotAllocated, + /*withElseRegion=*/true); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + // just embox the null address and return + mlir::Value nullBox = builder.create(loc, ty, addr); + builder.create(loc, nullBox, boxAlloca); + return ifOp; + }; + + // all arrays are boxed + if (auto boxTy = mlir::dyn_cast_or_null(ty)) { + bool isAllocatableOrPointer = + mlir::isa(boxTy.getEleTy()); + + builder.setInsertionPointToEnd(initBlock); + mlir::Value boxAlloca = allocatedPrivVarArg; + mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy()); + if (fir::isa_trivial(innerTy)) { + // boxed non-sequence value e.g. !fir.box> + if (!isAllocatableOrPointer) + TODO(loc, + "Reduction/Privatization of non-allocatable trivial typed box"); + + fir::IfOp ifUnallocated = handleNullAllocatable(boxAlloca); + + builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front()); + mlir::Value valAlloc = builder.create(loc, innerTy); + if (scalarInitValue) + builder.createStoreWithConvert(loc, scalarInitValue, valAlloc); + mlir::Value box = builder.create(loc, ty, valAlloc); + builder.create(loc, box, boxAlloca); + + createCleanupRegion(builder, loc, argType, cleanupRegion); + builder.setInsertionPointAfter(ifUnallocated); + yield(boxAlloca); + return; + } + innerTy = fir::extractSequenceType(boxTy); + if (!mlir::isa(innerTy)) + TODO(loc, "Unsupported boxed type for reduction/privatization"); + + fir::IfOp ifUnallocated{nullptr}; + if (isAllocatableOrPointer) { + ifUnallocated = handleNullAllocatable(boxAlloca); + builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front()); + } + + // Create the private 
copy from the initial fir.box: + mlir::Value loadedBox = builder.loadIfRef(loc, moldArg); + hlfir::Entity source = hlfir::Entity{loadedBox}; + + // Allocating on the heap in case the whole reduction is nested inside of a + // loop + // TODO: compare performance here to using allocas - this could be made to + // work by inserting stacksave/stackrestore around the reduction in + // openmpirbuilder + auto [temp, needsDealloc] = createTempFromMold(loc, builder, source); + // if needsDealloc isn't statically false, add cleanup region. Always + // do this for allocatable boxes because they might have been re-allocated + // in the body of the loop/parallel region + + std::optional cstNeedsDealloc = + fir::getIntIfConstant(needsDealloc); + assert(cstNeedsDealloc.has_value() && + "createTempFromMold decides this statically"); + if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) { + mlir::OpBuilder::InsertionGuard guard(builder); + createCleanupRegion(builder, loc, argType, cleanupRegion); + } else { + assert(!isAllocatableOrPointer && + "Pointer-like arrays must be heap allocated"); + } + + // Put the temporary inside of a box: + // hlfir::genVariableBox doesn't handle non-default lower bounds + mlir::Value box; + fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, loadedBox); + mlir::Type boxType = loadedBox.getType(); + if (mlir::isa(temp.getType())) + // the box created by the declare form createTempFromMold is missing lower + // bounds info + box = builder.create(loc, boxType, temp, shapeShift, + /*shift=*/mlir::Value{}); + else + box = builder.create( + loc, boxType, temp, shapeShift, + /*slice=*/mlir::Value{}, + /*typeParams=*/llvm::ArrayRef{}); + + if (scalarInitValue) + builder.create(loc, scalarInitValue, box); + builder.create(loc, box, boxAlloca); + if (ifUnallocated) + builder.setInsertionPointAfter(ifUnallocated); + yield(boxAlloca); + return; + } + + TODO(loc, + "creating reduction/privatization init region for unsupported type"); + return; 
+} diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.h b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h new file mode 100644 index 0000000000000..b4abc40cd4b67 --- /dev/null +++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h @@ -0,0 +1,51 @@ +//===-- Lower/OpenMP/PrivateReductionUtils.h --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H +#define FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H + +#include "mlir/IR/Location.h" +#include "mlir/IR/Value.h" + +namespace mlir { +class Region; +} // namespace mlir + +namespace fir { +class FirOpBuilder; +class ShapeShiftOp; +} // namespace fir + +namespace Fortran { +namespace lower { +namespace omp { + +/// Generate init and cleanup regions suitable for reduction or privatizer +/// declarations. `scalarInitValue` may be nullptr if there is no default +/// initialization (for privatization). +void populateByRefInitAndCleanupRegions(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Type argType, + mlir::Value scalarInitValue, + mlir::Block *initBlock, + mlir::Value allocatedPrivVarArg, + mlir::Value moldArg, + mlir::Region &cleanupRegion); + +/// Generate a fir::ShapeShift op describing the provided boxed array. 
+fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value box); + +} // namespace omp +} // namespace lower +} // namespace Fortran + +#endif // FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index 736de2ee511be..2cd21107a916e 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -12,6 +12,7 @@ #include "ReductionProcessor.h" +#include "PrivateReductionUtils.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/ConvertType.h" #include "flang/Lower/SymbolMap.h" @@ -294,33 +295,6 @@ mlir::Value ReductionProcessor::createScalarCombiner( return reductionOp; } -/// Generate a fir::ShapeShift op describing the provided boxed array. -static fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder, - mlir::Location loc, mlir::Value box) { - fir::SequenceType sequenceType = mlir::cast( - hlfir::getFortranElementOrSequenceType(box.getType())); - const unsigned rank = sequenceType.getDimension(); - llvm::SmallVector lbAndExtents; - lbAndExtents.reserve(rank * 2); - - mlir::Type idxTy = builder.getIndexType(); - for (unsigned i = 0; i < rank; ++i) { - // TODO: ideally we want to hoist box reads out of the critical section. 
- // We could do this by having box dimensions in block arguments like - // OpenACC does - mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); - auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, dim); - lbAndExtents.push_back(dimInfo.getLowerBound()); - lbAndExtents.push_back(dimInfo.getExtent()); - } - - auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank); - auto shapeShift = - builder.create(loc, shapeShiftTy, lbAndExtents); - return shapeShift; -} - /// Create reduction combiner region for reduction variables which are boxed /// arrays static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, @@ -422,59 +396,6 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, TODO(loc, "OpenMP genCombiner for unsupported reduction variable type"); } -static void -createReductionCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::omp::DeclareReductionOp &reductionDecl) { - mlir::Type redTy = reductionDecl.getType(); - - mlir::Region &cleanupRegion = reductionDecl.getCleanupRegion(); - assert(cleanupRegion.empty()); - mlir::Block *block = - builder.createBlock(&cleanupRegion, cleanupRegion.end(), {redTy}, {loc}); - builder.setInsertionPointToEnd(block); - - auto typeError = [loc]() { - fir::emitFatalError(loc, - "Attempt to create an omp reduction cleanup region " - "for a type that wasn't allocated", - /*genCrashDiag=*/true); - }; - - mlir::Type valTy = fir::unwrapRefType(redTy); - if (auto boxTy = mlir::dyn_cast_or_null(valTy)) { - if (!mlir::isa(boxTy.getEleTy())) { - mlir::Type innerTy = fir::extractSequenceType(boxTy); - if (!mlir::isa(innerTy)) - typeError(); - } - - mlir::Value arg = block->getArgument(0); - arg = builder.loadIfRef(loc, arg); - assert(mlir::isa(arg.getType())); - - // Deallocate box - // The FIR type system doesn't nesecarrily know that this is a mutable box - // if we allocated the thread local array on the heap to avoid looped stack - // allocations. 
- mlir::Value addr = - hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg}); - mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr); - fir::IfOp ifOp = - builder.create(loc, isAllocated, /*withElseRegion=*/false); - builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - - mlir::Value cast = builder.createConvert( - loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr); - builder.create(loc, cast); - - builder.setInsertionPointAfter(ifOp); - builder.create(loc); - return; - } - - typeError(); -} - // like fir::unwrapSeqOrBoxedSeqType except it also works for non-sequence boxes static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) { if (auto seqTy = mlir::dyn_cast(ty)) @@ -517,154 +438,31 @@ static void createReductionAllocAndInitRegions( mlir::Value initValue = ReductionProcessor::getReductionInitValue( loc, unwrapSeqOrBoxedType(ty), redId, builder); + if (isByRef) { + populateByRefInitAndCleanupRegions(builder, loc, type, initValue, initBlock, + reductionDecl.getInitializerAllocArg(), + reductionDecl.getInitializerMoldArg(), + reductionDecl.getCleanupRegion()); + } + if (fir::isa_trivial(ty)) { if (isByRef) { // alloc region - { - builder.setInsertionPointToEnd(allocBlock); - mlir::Value alloca = builder.create(loc, ty); - yield(alloca); - } - - // init region - { - builder.setInsertionPointToEnd(initBlock); - // block arg is mapped to the alloca yielded from the alloc region - mlir::Value alloc = reductionDecl.getInitializerAllocArg(); - builder.createStoreWithConvert(loc, initValue, alloc); - yield(alloc); - } + builder.setInsertionPointToEnd(allocBlock); + mlir::Value alloca = builder.create(loc, ty); + yield(alloca); return; } // by val yield(initValue); return; } + assert(isByRef && "passing non-trivial types by val is unsupported"); - // check if an allocatable box is unallocated. If so, initialize the boxAlloca - // to be unallocated e.g. 
- // %box_alloca = fir.alloca !fir.box> - // %addr = fir.box_addr %box - // if (%addr == 0) { - // %nullbox = fir.embox %addr - // fir.store %nullbox to %box_alloca - // } else { - // // ... - // fir.store %something to %box_alloca - // } - // omp.yield %box_alloca - mlir::Value moldArg = - builder.loadIfRef(loc, reductionDecl.getInitializerMoldArg()); - auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp { - mlir::Value addr = builder.create(loc, moldArg); - mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr); - fir::IfOp ifOp = builder.create(loc, isNotAllocated, - /*withElseRegion=*/true); - builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - // just embox the null address and return - mlir::Value nullBox = builder.create(loc, ty, addr); - builder.create(loc, nullBox, boxAlloca); - return ifOp; - }; - - // all arrays are boxed - if (auto boxTy = mlir::dyn_cast_or_null(ty)) { - assert(isByRef && "passing boxes by value is unsupported"); - bool isAllocatableOrPointer = - mlir::isa(boxTy.getEleTy()); - - // alloc region - { - builder.setInsertionPointToEnd(allocBlock); - mlir::Value boxAlloca = builder.create(loc, ty); - yield(boxAlloca); - } - - // init region - builder.setInsertionPointToEnd(initBlock); - mlir::Value boxAlloca = reductionDecl.getInitializerAllocArg(); - mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy()); - if (fir::isa_trivial(innerTy)) { - // boxed non-sequence value e.g. 
!fir.box> - if (!isAllocatableOrPointer) - TODO(loc, "Reduction of non-allocatable trivial typed box"); - - fir::IfOp ifUnallocated = handleNullAllocatable(boxAlloca); - - builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front()); - mlir::Value valAlloc = builder.create(loc, innerTy); - builder.createStoreWithConvert(loc, initValue, valAlloc); - mlir::Value box = builder.create(loc, ty, valAlloc); - builder.create(loc, box, boxAlloca); - - auto insPt = builder.saveInsertionPoint(); - createReductionCleanupRegion(builder, loc, reductionDecl); - builder.restoreInsertionPoint(insPt); - builder.setInsertionPointAfter(ifUnallocated); - yield(boxAlloca); - return; - } - innerTy = fir::extractSequenceType(boxTy); - if (!mlir::isa(innerTy)) - TODO(loc, "Unsupported boxed type for reduction"); - - fir::IfOp ifUnallocated{nullptr}; - if (isAllocatableOrPointer) { - ifUnallocated = handleNullAllocatable(boxAlloca); - builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front()); - } - - // Create the private copy from the initial fir.box: - mlir::Value loadedBox = builder.loadIfRef(loc, moldArg); - hlfir::Entity source = hlfir::Entity{loadedBox}; - - // Allocating on the heap in case the whole reduction is nested inside of a - // loop - // TODO: compare performance here to using allocas - this could be made to - // work by inserting stacksave/stackrestore around the reduction in - // openmpirbuilder - auto [temp, needsDealloc] = createTempFromMold(loc, builder, source); - // if needsDealloc isn't statically false, add cleanup region. 
Always - // do this for allocatable boxes because they might have been re-allocated - // in the body of the loop/parallel region - - std::optional cstNeedsDealloc = - fir::getIntIfConstant(needsDealloc); - assert(cstNeedsDealloc.has_value() && - "createTempFromMold decides this statically"); - if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) { - mlir::OpBuilder::InsertionGuard guard(builder); - createReductionCleanupRegion(builder, loc, reductionDecl); - } else { - assert(!isAllocatableOrPointer && - "Pointer-like arrays must be heap allocated"); - } - - // Put the temporary inside of a box: - // hlfir::genVariableBox doesn't handle non-default lower bounds - mlir::Value box; - fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, loadedBox); - mlir::Type boxType = loadedBox.getType(); - if (mlir::isa(temp.getType())) - // the box created by the declare form createTempFromMold is missing lower - // bounds info - box = builder.create(loc, boxType, temp, shapeShift, - /*shift=*/mlir::Value{}); - else - box = builder.create( - loc, boxType, temp, shapeShift, - /*slice=*/mlir::Value{}, - /*typeParams=*/llvm::ArrayRef{}); - - builder.create(loc, initValue, box); - builder.create(loc, box, boxAlloca); - if (ifUnallocated) - builder.setInsertionPointAfter(ifUnallocated); - yield(boxAlloca); - return; - } - - TODO(loc, "createReductionInitRegion for unsupported type"); + // alloc region + builder.setInsertionPointToEnd(allocBlock); + mlir::Value boxAlloca = builder.create(loc, ty); + yield(boxAlloca); } mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index cdcf9bda49a62..fa83aa380e489 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1313,7 +1313,8 @@ void fir::ConvertOp::getCanonicalizationPatterns( results.insert(context); + ForwardConstantConvertPattern, ChainedPointerConvertsPattern>( 
+ context); } mlir::OpFoldResult fir::ConvertOp::fold(FoldAdaptor adaptor) { diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 20e4599587c4b..e1d7376ec3805 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -240,6 +240,16 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); pm.addPass(hlfir::createLowerHLFIRIntrinsics()); pm.addPass(hlfir::createBufferizeHLFIR()); + // Run hlfir.assign inlining again after BufferizeHLFIR, + // because the latter may introduce new hlfir.assign operations, + // e.g. for copying an array into a temporary due to + // hlfir.associate. + // TODO: we can remove the previous InlineHLFIRAssign, when + // FIR AliasAnalysis is good enough to say that a temporary + // array does not alias with any user object. + if (optLevel.isOptimizingForSpeed()) + addNestedPassToAllTopLevelOperations( + pm, hlfir::createInlineHLFIRAssign); pm.addPass(hlfir::createConvertHLFIRtoFIR()); if (enableOpenMP) pm.addPass(flangomp::createLowerWorkshare()); diff --git a/flang/runtime/derived.cpp b/flang/runtime/derived.cpp index 7c164ff890452..10813c62e5da1 100644 --- a/flang/runtime/derived.cpp +++ b/flang/runtime/derived.cpp @@ -129,6 +129,10 @@ RT_API_ATTRS int InitializeClone(const Descriptor &clone, std::size_t elements{orig.Elements()}; int stat{StatOk}; + // Skip pointers and unallocated variables. + if (orig.IsPointer() || !orig.IsAllocated()) { + return stat; + } // Initialize each data component. std::size_t components{componentDesc.Elements()}; for (std::size_t i{0}; i < components; ++i) { diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 9655afce96d92..55e86da2dfdf1 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -49,6 +49,15 @@ ! 
ALL: LowerHLFIROrderedAssignments ! ALL-NEXT: LowerHLFIRIntrinsics ! ALL-NEXT: BufferizeHLFIR +! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! O2-NEXT: 'fir.global' Pipeline +! O2-NEXT: InlineHLFIRAssign +! O2-NEXT: 'func.func' Pipeline +! O2-NEXT: InlineHLFIRAssign +! O2-NEXT: 'omp.declare_reduction' Pipeline +! O2-NEXT: InlineHLFIRAssign +! O2-NEXT: 'omp.private' Pipeline +! O2-NEXT: InlineHLFIRAssign ! ALL-NEXT: ConvertHLFIRtoFIR ! ALL-NEXT: CSE ! Ideally, we need an output with only the pass names, but diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 620882ebbed2a..29a0f66157971 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -50,6 +50,15 @@ func.func @_QQmain() { // PASSES-NEXT: LowerHLFIROrderedAssignments // PASSES-NEXT: LowerHLFIRIntrinsics // PASSES-NEXT: BufferizeHLFIR +// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +// PASSES-NEXT: 'fir.global' Pipeline +// PASSES-NEXT: InlineHLFIRAssign +// PASSES-NEXT: 'func.func' Pipeline +// PASSES-NEXT: InlineHLFIRAssign +// PASSES-NEXT: 'omp.declare_reduction' Pipeline +// PASSES-NEXT: InlineHLFIRAssign +// PASSES-NEXT: 'omp.private' Pipeline +// PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: ConvertHLFIRtoFIR // PASSES-NEXT: LowerWorkshare // PASSES-NEXT: CSE diff --git a/flang/test/Fir/convert-fold.fir b/flang/test/Fir/convert-fold.fir index ebb6c8db7c891..fb30e634ba5e6 100644 --- a/flang/test/Fir/convert-fold.fir +++ b/flang/test/Fir/convert-fold.fir @@ -35,3 +35,12 @@ func.func @ctest() -> index { // CHECK-NEXT: return %{{.*}} : index return %2 : index } + +// CHECK-LABEL: func.func @ptrtest( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.heap>) -> !fir.heap> { +func.func @ptrtest(%0 : !fir.heap>) -> !fir.heap> { + %1 = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %2 = fir.convert %1 : (!fir.ref>) -> !fir.heap> +// CHECK: return 
%[[VAL_0]] : !fir.heap> + return %2 : !fir.heap> +} diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable.f90 index d265954ef1ce1..2dc4e20f27af2 100644 --- a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 +++ b/flang/test/Lower/OpenMP/derived-type-allocatable.f90 @@ -13,6 +13,10 @@ module m1 contains +!CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_pointer +!CHECK-NOT: fir.call @_FortranAInitializeClone +!CHECK: omp.yield + !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_nested !CHECK: fir.call @_FortranAInitializeClone !CHECK-NEXT: omp.yield @@ -91,4 +95,11 @@ subroutine test_nested() !$omp parallel private(d2) !$omp end parallel end subroutine + + subroutine test_pointer() + type(x), pointer :: ptr + + !$omp parallel private(ptr) + !$omp end parallel + end subroutine end module diff --git a/flang/test/Lower/array-substring.f90 b/flang/test/Lower/array-substring.f90 index 02101039120e9..7544fbb989627 100644 --- a/flang/test/Lower/array-substring.f90 +++ b/flang/test/Lower/array-substring.f90 @@ -24,9 +24,8 @@ ! CHECK: %[[VAL_16:.*]] = fir.array_coor %[[VAL_7]](%[[VAL_9]]) {{\[}}%[[VAL_10]]] %[[VAL_15]] : (!fir.ref>>, !fir.shape<1>, !fir.slice<1>, index) -> !fir.ref> ! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref>) -> !fir.ref>> ! CHECK: %[[VAL_18:.*]] = fir.coordinate_of %[[VAL_17]], %[[VAL_2]] : (!fir.ref>>, index) -> !fir.ref> -! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_20:.*]] = fir.array_coor %[[VAL_11]](%[[VAL_9]]) %[[VAL_15]] : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> -! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.ref>) -> !fir.ref +! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_18]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 ! 
CHECK: %[[VAL_24:.*]] = fir.call @_FortranACharacterCompareScalar1(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_23]]) {{.*}}: (!fir.ref, !fir.ref, i64, i64) -> i32 diff --git a/flang/test/Lower/vector-subscript-io.f90 b/flang/test/Lower/vector-subscript-io.f90 index 372130fd09907..9a041af16c88c 100644 --- a/flang/test/Lower/vector-subscript-io.f90 +++ b/flang/test/Lower/vector-subscript-io.f90 @@ -325,12 +325,11 @@ subroutine substring(x, y, i, j) ! CHECK: %[[VAL_230:.*]] = arith.subi %[[VAL_216]], %[[VAL_210]] : index ! CHECK: %[[VAL_231:.*]] = fir.convert %[[VAL_228]] : (!fir.ref>) -> !fir.ref>> ! CHECK: %[[VAL_232:.*]] = fir.coordinate_of %[[VAL_231]], %[[VAL_230]] : (!fir.ref>>, index) -> !fir.ref> -! CHECK: %[[VAL_233:.*]] = fir.convert %[[VAL_232]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_234:.*]] = arith.subi %[[VAL_219]], %[[VAL_216]] : index ! CHECK: %[[VAL_235:.*]] = arith.addi %[[VAL_234]], %[[VAL_210]] : index ! CHECK: %[[VAL_236:.*]] = arith.cmpi slt, %[[VAL_235]], %[[VAL_209]] : index ! CHECK: %[[VAL_237:.*]] = arith.select %[[VAL_236]], %[[VAL_209]], %[[VAL_235]] : index -! CHECK: %[[VAL_238:.*]] = fir.convert %[[VAL_233]] : (!fir.ref>) -> !fir.ref +! CHECK: %[[VAL_238:.*]] = fir.convert %[[VAL_232]] : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_239:.*]] = fir.convert %[[VAL_237]] : (index) -> i64 ! CHECK: %[[VAL_240:.*]] = fir.call @_FortranAioInputAscii(%[[VAL_213]], %[[VAL_238]], %[[VAL_239]]) {{.*}}: (!fir.ref, !fir.ref, i64) -> i1 ! CHECK: %[[VAL_241:.*]] = arith.addi %[[VAL_221]], %[[VAL_210]] overflow : index diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt index 28fd9a1ebcc97..9b359f65cdb33 100644 --- a/libc/src/__support/GPU/CMakeLists.txt +++ b/libc/src/__support/GPU/CMakeLists.txt @@ -1,16 +1,12 @@ -if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) +# These utilities are GPU only. 
+if(NOT LIBC_TARGET_OS_IS_GPU) return() endif() -add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) -set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils) - add_header_library( utils HDRS utils.h - DEPENDS - ${target_gpu_utils} ) add_object_library( @@ -21,6 +17,6 @@ add_object_library( allocator.h DEPENDS libc.src.__support.common - libc.src.__support.GPU.utils libc.src.__support.RPC.rpc_client + .utils ) diff --git a/libc/src/__support/GPU/amdgpu/CMakeLists.txt b/libc/src/__support/GPU/amdgpu/CMakeLists.txt deleted file mode 100644 index f2b98fc03b218..0000000000000 --- a/libc/src/__support/GPU/amdgpu/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - amdgpu_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h deleted file mode 100644 index 6ab95403ca389..0000000000000 --- a/libc/src/__support/GPU/amdgpu/utils.h +++ /dev/null @@ -1,183 +0,0 @@ -//===-------------- AMDGPU implementation of GPU utils ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -/// Type aliases to the address spaces used by the AMDGPU backend. 
-template using Private = [[clang::opencl_private]] T; -template using Constant = [[clang::opencl_constant]] T; -template using Local = [[clang::opencl_local]] T; -template using Global = [[clang::opencl_global]] T; - -/// Returns the number of workgroups in the 'x' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_x() { - return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); -} - -/// Returns the number of workgroups in the 'y' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_y() { - return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); -} - -/// Returns the number of workgroups in the 'z' dimension of the grid. -LIBC_INLINE uint32_t get_num_blocks_z() { - return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); -} - -/// Returns the total number of workgruops in the grid. -LIBC_INLINE uint64_t get_num_blocks() { - return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); -} - -/// Returns the 'x' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_x() { - return __builtin_amdgcn_workgroup_id_x(); -} - -/// Returns the 'y' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_y() { - return __builtin_amdgcn_workgroup_id_y(); -} - -/// Returns the 'z' dimension of the current AMD workgroup's id. -LIBC_INLINE uint32_t get_block_id_z() { - return __builtin_amdgcn_workgroup_id_z(); -} - -/// Returns the absolute id of the AMD workgroup. -LIBC_INLINE uint64_t get_block_id() { - return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + - get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); -} - -/// Returns the number of workitems in the 'x' dimension. -LIBC_INLINE uint32_t get_num_threads_x() { - return __builtin_amdgcn_workgroup_size_x(); -} - -/// Returns the number of workitems in the 'y' dimension. 
-LIBC_INLINE uint32_t get_num_threads_y() { - return __builtin_amdgcn_workgroup_size_y(); -} - -/// Returns the number of workitems in the 'z' dimension. -LIBC_INLINE uint32_t get_num_threads_z() { - return __builtin_amdgcn_workgroup_size_z(); -} - -/// Returns the total number of workitems in the workgroup. -LIBC_INLINE uint64_t get_num_threads() { - return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); -} - -/// Returns the 'x' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_x() { - return __builtin_amdgcn_workitem_id_x(); -} - -/// Returns the 'y' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_y() { - return __builtin_amdgcn_workitem_id_y(); -} - -/// Returns the 'z' dimension id of the workitem in the current AMD workgroup. -LIBC_INLINE uint32_t get_thread_id_z() { - return __builtin_amdgcn_workitem_id_z(); -} - -/// Returns the absolute id of the thread in the current AMD workgroup. -LIBC_INLINE uint64_t get_thread_id() { - return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + - get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); -} - -/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware -/// and compilation options. -LIBC_INLINE uint32_t get_lane_size() { - return __builtin_amdgcn_wavefrontsize(); -} - -/// Returns the id of the thread inside of an AMD wavefront executing together. -[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} - -/// Returns the bit-mask of active threads in the current wavefront. -[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() { - return __builtin_amdgcn_read_exec(); -} - -/// Copies the value from the first active thread in the wavefront to the rest. 
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t, - uint32_t x) { - return __builtin_amdgcn_readfirstlane(x); -} - -/// Returns a bitmask of threads in the current lane for which \p x is true. -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - // the lane_mask & gives the nvptx semantics when lane_mask is a subset of - // the active threads - return lane_mask & __builtin_amdgcn_ballot_w64(x); -} - -/// Waits for all the threads in the block to converge and issues a fence. -[[clang::convergent]] LIBC_INLINE void sync_threads() { - __builtin_amdgcn_s_barrier(); - __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); -} - -/// Waits for all pending memory operations to complete in program order. -[[clang::convergent]] LIBC_INLINE void memory_fence() { - __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); -} - -/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU. -[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) { - __builtin_amdgcn_wave_barrier(); -} - -/// Shuffles the the lanes inside the wavefront according to the given index. -[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx, - uint32_t x) { - return __builtin_amdgcn_ds_bpermute(idx << 2, x); -} - -/// Returns the current value of the GPU's processor clock. -/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter. -LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } - -/// Returns a fixed-frequency timestamp. The actual frequency is dependent on -/// the card and can only be queried via the driver. -LIBC_INLINE uint64_t fixed_frequency_clock() { - return __builtin_readsteadycounter(); -} - -/// Terminates execution of the associated wavefront. -[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); } - -/// Returns a unique identifier for the process cluster the current wavefront is -/// executing on. 
Here we use the identifier for the compute unit (CU) and -/// shader engine. -/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface -/// than the one at -/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899 -LIBC_INLINE uint32_t get_cluster_id() { return 0; } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif diff --git a/libc/src/__support/GPU/generic/CMakeLists.txt b/libc/src/__support/GPU/generic/CMakeLists.txt deleted file mode 100644 index 68ba7d1ec80e9..0000000000000 --- a/libc/src/__support/GPU/generic/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - generic_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h deleted file mode 100644 index 9461ef0aa245b..0000000000000 --- a/libc/src/__support/GPU/generic/utils.h +++ /dev/null @@ -1,84 +0,0 @@ -//===-------------- Generic implementation of GPU utils ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -template using Private = T; -template using Constant = T; -template using Shared = T; -template using Global = T; - -LIBC_INLINE uint32_t get_num_blocks_x() { return 1; } - -LIBC_INLINE uint32_t get_num_blocks_y() { return 1; } - -LIBC_INLINE uint32_t get_num_blocks_z() { return 1; } - -LIBC_INLINE uint64_t get_num_blocks() { return 1; } - -LIBC_INLINE uint32_t get_block_id_x() { return 0; } - -LIBC_INLINE uint32_t get_block_id_y() { return 0; } - -LIBC_INLINE uint32_t get_block_id_z() { return 0; } - -LIBC_INLINE uint64_t get_block_id() { return 0; } - -LIBC_INLINE uint32_t get_num_threads_x() { return 1; } - -LIBC_INLINE uint32_t get_num_threads_y() { return 1; } - -LIBC_INLINE uint32_t get_num_threads_z() { return 1; } - -LIBC_INLINE uint64_t get_num_threads() { return 1; } - -LIBC_INLINE uint32_t get_thread_id_x() { return 0; } - -LIBC_INLINE uint32_t get_thread_id_y() { return 0; } - -LIBC_INLINE uint32_t get_thread_id_z() { return 0; } - -LIBC_INLINE uint64_t get_thread_id() { return 0; } - -LIBC_INLINE uint32_t get_lane_size() { return 1; } - -LIBC_INLINE uint32_t get_lane_id() { return 0; } - -LIBC_INLINE uint64_t get_lane_mask() { return 1; } - -LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; } - -LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; } - -LIBC_INLINE void sync_threads() {} - -LIBC_INLINE void sync_lane(uint64_t) {} - -LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; } - -LIBC_INLINE uint64_t processor_clock() { return 0; } - -LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; } - 
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); } - -LIBC_INLINE uint32_t get_cluster_id() { return 0; } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H diff --git a/libc/src/__support/GPU/nvptx/CMakeLists.txt b/libc/src/__support/GPU/nvptx/CMakeLists.txt deleted file mode 100644 index 0d3f8c7933c86..0000000000000 --- a/libc/src/__support/GPU/nvptx/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - nvptx_utils - HDRS - utils.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h deleted file mode 100644 index 1a43a839a9ce4..0000000000000 --- a/libc/src/__support/GPU/nvptx/utils.h +++ /dev/null @@ -1,160 +0,0 @@ -//===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-id: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H -#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H - -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -#include - -namespace LIBC_NAMESPACE_DECL { -namespace gpu { - -/// Type aliases to the address spaces used by the NVPTX backend. -template using Private = [[clang::opencl_private]] T; -template using Constant = [[clang::opencl_constant]] T; -template using Local = [[clang::opencl_local]] T; -template using Global = [[clang::opencl_global]] T; - -/// Returns the number of CUDA blocks in the 'x' dimension. -LIBC_INLINE uint32_t get_num_blocks_x() { - return __nvvm_read_ptx_sreg_nctaid_x(); -} - -/// Returns the number of CUDA blocks in the 'y' dimension. 
-LIBC_INLINE uint32_t get_num_blocks_y() { - return __nvvm_read_ptx_sreg_nctaid_y(); -} - -/// Returns the number of CUDA blocks in the 'z' dimension. -LIBC_INLINE uint32_t get_num_blocks_z() { - return __nvvm_read_ptx_sreg_nctaid_z(); -} - -/// Returns the total number of CUDA blocks. -LIBC_INLINE uint64_t get_num_blocks() { - return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); -} - -/// Returns the 'x' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); } - -/// Returns the 'y' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); } - -/// Returns the 'z' dimension of the current CUDA block's id. -LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); } - -/// Returns the absolute id of the CUDA block. -LIBC_INLINE uint64_t get_block_id() { - return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + - get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); -} - -/// Returns the number of CUDA threads in the 'x' dimension. -LIBC_INLINE uint32_t get_num_threads_x() { - return __nvvm_read_ptx_sreg_ntid_x(); -} - -/// Returns the number of CUDA threads in the 'y' dimension. -LIBC_INLINE uint32_t get_num_threads_y() { - return __nvvm_read_ptx_sreg_ntid_y(); -} - -/// Returns the number of CUDA threads in the 'z' dimension. -LIBC_INLINE uint32_t get_num_threads_z() { - return __nvvm_read_ptx_sreg_ntid_z(); -} - -/// Returns the total number of threads in the block. -LIBC_INLINE uint64_t get_num_threads() { - return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); -} - -/// Returns the 'x' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); } - -/// Returns the 'y' dimension id of the thread in the current CUDA block. 
-LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); } - -/// Returns the 'z' dimension id of the thread in the current CUDA block. -LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); } - -/// Returns the absolute id of the thread in the current CUDA block. -LIBC_INLINE uint64_t get_thread_id() { - return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + - get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); -} - -/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware. -LIBC_INLINE uint32_t get_lane_size() { return 32; } - -/// Returns the id of the thread inside of a CUDA warp executing together. -[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - return __nvvm_read_ptx_sreg_laneid(); -} - -/// Returns the bit-mask of active threads in the current warp. -[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() { - return __nvvm_activemask(); -} - -/// Copies the value from the first active thread in the warp to the rest. -[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, - uint32_t x) { - uint32_t mask = static_cast(lane_mask); - uint32_t id = __builtin_ffs(mask) - 1; - return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1); -} - -/// Returns a bitmask of threads in the current lane for which \p x is true. -[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { - uint32_t mask = static_cast(lane_mask); - return __nvvm_vote_ballot_sync(mask, x); -} - -/// Waits for all the threads in the block to converge and issues a fence. -[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); } - -/// Waits for all pending memory operations to complete in program order. -[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); } - -/// Waits for all threads in the warp to reconverge for independent scheduling. 
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) { - __nvvm_bar_warp_sync(static_cast(mask)); -} - -/// Shuffles the the lanes inside the warp according to the given index. -[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, - uint32_t idx, uint32_t x) { - uint32_t mask = static_cast(lane_mask); - uint32_t bitmask = (mask >> idx) & 1; - return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1); -} - -/// Returns the current value of the GPU's processor clock. -LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } - -/// Returns a global fixed-frequency timer at nanosecond frequency. -LIBC_INLINE uint64_t fixed_frequency_clock() { - return __builtin_readsteadycounter(); -} - -/// Terminates execution of the calling thread. -[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); } - -/// Returns a unique identifier for the process cluster the current warp is -/// executing on. Here we use the identifier for the symmetric multiprocessor. -LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); } - -} // namespace gpu -} // namespace LIBC_NAMESPACE_DECL - -#endif diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index ae52e7a088ad5..e138c84c0cb22 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -9,48 +9,108 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H +#include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#if defined(LIBC_TARGET_ARCH_IS_AMDGPU) -#include "amdgpu/utils.h" -#elif defined(LIBC_TARGET_ARCH_IS_NVPTX) -#include "nvptx/utils.h" -#else -#include "generic/utils.h" +#if !__has_include() +#error "Unsupported compiler" #endif +#include + namespace LIBC_NAMESPACE_DECL { namespace gpu { -/// Get the first active thread inside the lane. 
-LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) { - return __builtin_ffsll(lane_mask) - 1; + +template using Private = __gpu_private T; +template using Constant = __gpu_constant T; +template using Local = __gpu_local T; +template using Global = __gpu_local T; + +LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); } + +LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); } + +LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); } + +LIBC_INLINE uint64_t get_num_blocks() { + return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z(); +} + +LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); } + +LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); } + +LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); } + +LIBC_INLINE uint64_t get_block_id() { + return get_block_id_x() + get_num_blocks_x() * get_block_id_y() + + get_num_blocks_x() * get_num_blocks_y() * get_block_id_z(); +} + +LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); } + +LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); } + +LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); } + +LIBC_INLINE uint64_t get_num_threads() { + return get_num_threads_x() * get_num_threads_y() * get_num_threads_z(); +} + +LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); } + +LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); } + +LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); } + +LIBC_INLINE uint64_t get_thread_id() { + return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() + + get_num_threads_x() * get_num_threads_y() * get_thread_id_z(); +} + +LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); } + +LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); } + +LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); } + +LIBC_INLINE uint32_t 
broadcast_value(uint64_t lane_mask, uint32_t x) { + return __gpu_read_first_lane_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) { + return __gpu_ballot(lane_mask, x); +} + +LIBC_INLINE void sync_threads() { __gpu_sync_threads(); } + +LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); } + +LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) { + return __gpu_shuffle_idx_u32(lane_mask, idx, x); } -/// Conditional that is only true for a single thread in a lane. +[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); } + LIBC_INLINE bool is_first_lane(uint64_t lane_mask) { - return gpu::get_lane_id() == get_first_lane_id(lane_mask); + return __gpu_is_first_in_lane(lane_mask); } -/// Gets the sum of all lanes inside the warp or wavefront. LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) { - for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) { - uint32_t index = step + gpu::get_lane_id(); - x += gpu::shuffle(lane_mask, index, x); - } - return gpu::broadcast_value(lane_mask, x); + return __gpu_lane_sum_u32(lane_mask, x); } -/// Gets the accumulator scan of the threads in the warp or wavefront. 
LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) { - for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) { - uint32_t index = gpu::get_lane_id() - step; - uint32_t bitmask = gpu::get_lane_id() >= step; - x += -bitmask & gpu::shuffle(lane_mask, index, x); - } - return x; + return __gpu_lane_scan_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t fixed_frequency_clock() { + return __builtin_readsteadycounter(); } +LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } + } // namespace gpu } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/gpu/clock.cpp b/libc/src/time/gpu/clock.cpp index add5b2725ef8f..8609c5cd6b6b7 100644 --- a/libc/src/time/gpu/clock.cpp +++ b/libc/src/time/gpu/clock.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/time/clock.h" + +#include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/time/gpu/time_utils.h" diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp index a92f660f225cb..d22d9d6bd8d79 100644 --- a/libc/src/time/gpu/nanosleep.cpp +++ b/libc/src/time/gpu/nanosleep.cpp @@ -8,6 +8,7 @@ #include "src/time/nanosleep.h" +#include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/time/gpu/time_utils.h" diff --git a/libc/src/time/mktime.cpp b/libc/src/time/mktime.cpp index 72cd229120538..b5d1da5fa8fba 100644 --- a/libc/src/time/mktime.cpp +++ b/libc/src/time/mktime.cpp @@ -106,7 +106,8 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) { } } - // TODO(rtenneti): Need to handle timezone and update of tm_isdst. + // TODO: https://github.com/llvm/llvm-project/issues/121962 + // Need to handle timezone and update of tm_isdst. 
int64_t seconds = tm_out->tm_sec + tm_out->tm_min * TimeConstants::SECONDS_PER_MIN + tm_out->tm_hour * TimeConstants::SECONDS_PER_HOUR + diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index cf092fabd046f..e98b96bfb478f 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -459,6 +459,29 @@ we only want to make sure they don't rot. Do not rely on the results of benchmar run through ``check-cxx`` for anything, instead run the benchmarks manually using the instructions for running individual tests. +If you want to compare the results of different benchmark runs, we recommend using the +``libcxx-compare-benchmarks`` helper tool. First, configure CMake in a build directory +and run the benchmark: + +.. code-block:: bash + + $ cmake -S runtimes -B [...] + $ libcxx/utils/libcxx-lit libcxx/test/benchmarks/string.bench.cpp --param optimization=speed + +Then, do the same for the second configuration you want to test. Use a different build +directory for that configuration: + +.. code-block:: bash + + $ cmake -S runtimes -B [...] + $ libcxx/utils/libcxx-lit libcxx/test/benchmarks/string.bench.cpp --param optimization=speed + +Finally, use ``libcxx-compare-benchmarks`` to compare both: + +.. code-block:: bash + + $ libcxx/utils/libcxx-compare-benchmarks libcxx/test/benchmarks/string.bench.cpp + .. _`Google Benchmark`: https://github.com/google/benchmark .. 
_testing-hardening-assertions: diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp index bd37c5af86f6c..208a55723d883 100644 --- a/libcxx/src/filesystem/operations.cpp +++ b/libcxx/src/filesystem/operations.cpp @@ -238,8 +238,14 @@ bool copy_file_impl_copy_file_range(FileDescriptor& read_fd, FileDescriptor& wri return false; } // do not modify the fd positions as copy_file_impl_sendfile may be called after a partial copy +# if defined(__linux__) + loff_t off_in = 0; + loff_t off_out = 0; +# else off_t off_in = 0; off_t off_out = 0; +# endif + do { ssize_t res; diff --git a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp index cd998d46b7e8f..c2afa6b8dfd07 100644 --- a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp +++ b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp @@ -17,24 +17,47 @@ #include #include "atomic_helpers.h" +#include "test_helper.h" #include "test_macros.h" template struct TestExchange { void operator()() const { - T x(T(1)); - std::atomic_ref const a(x); + { + T x(T(1)); + std::atomic_ref const a(x); + + { + std::same_as decltype(auto) y = a.exchange(T(2)); + assert(y == T(1)); + ASSERT_NOEXCEPT(a.exchange(T(2))); + } + + { + std::same_as decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst); + assert(y == T(2)); + ASSERT_NOEXCEPT(a.exchange(T(3), std::memory_order_seq_cst)); + } + } + // memory_order::release { - std::same_as decltype(auto) y = a.exchange(T(2)); - assert(y == T(1)); - ASSERT_NOEXCEPT(a.exchange(T(2))); + auto exchange = [](std::atomic_ref const& x, T, T new_val) { + x.exchange(new_val, std::memory_order::release); + }; + auto load = [](std::atomic_ref const& x) { return x.load(std::memory_order::acquire); }; + test_acquire_release(exchange, load); } + // memory_order::seq_cst { - std::same_as decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst); - assert(y == T(2)); - ASSERT_NOEXCEPT(a.exchange(T(3), 
std::memory_order_seq_cst)); + auto exchange_no_arg = [](std::atomic_ref const& x, T, T new_val) { x.exchange(new_val); }; + auto exchange_with_order = [](std::atomic_ref const& x, T, T new_val) { + x.exchange(new_val, std::memory_order::seq_cst); + }; + auto load = [](std::atomic_ref const& x) { return x.load(); }; + test_seq_cst(exchange_no_arg, load); + test_seq_cst(exchange_with_order, load); } } }; diff --git a/libcxx/utils/libcxx-benchmark-json b/libcxx/utils/libcxx-benchmark-json new file mode 100755 index 0000000000000..7f743c32caf40 --- /dev/null +++ b/libcxx/utils/libcxx-benchmark-json @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +set -e + +PROGNAME="$(basename "${0}")" +MONOREPO_ROOT="$(realpath $(dirname "${PROGNAME}"))" +function usage() { +cat < benchmarks... + +Print the path to the JSON files containing benchmark results for the given benchmarks. + +This requires those benchmarks to have already been run, i.e. this only resolves the path +to the benchmark .json file within the build directory. + + The path to the build directory. +benchmarks... Paths of the benchmarks to extract the results for. Those paths are relative to ''. + +Example +======= +$ cmake -S runtimes -B build/ -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" +$ libcxx-lit build/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp +$ less \$(${PROGNAME} build/ libcxx/test/benchmarks/algorithms/for_each.bench.cpp) +EOF +} + +if [[ "${1}" == "-h" || "${1}" == "--help" ]]; then + usage + exit 0 +fi + +if [[ $# -lt 1 ]]; then + usage + exit 1 +fi + +build_dir="${1}" +shift + +for benchmark in ${@}; do + # Normalize the paths by turning all benchmarks paths into absolute ones and then making them + # relative to the root of the monorepo. 
+ benchmark="$(realpath ${benchmark})" + relative=$(python -c "import os; import sys; print(os.path.relpath(sys.argv[1], sys.argv[2]))" "${benchmark}" "${MONOREPO_ROOT}") + + # Extract components of the benchmark path + directory="$(dirname ${relative})" + file="$(basename ${relative})" + + # Reconstruct the (slightly weird) path to the benchmark json file. This should be kept in sync + # whenever the test suite changes. + json="${build_dir}/${directory}/Output/${file}.dir/benchmark-result.json" + if [[ -f "${json}" ]]; then + echo "${json}" + fi +done diff --git a/libcxx/utils/libcxx-compare-benchmarks b/libcxx/utils/libcxx-compare-benchmarks new file mode 100755 index 0000000000000..e04820fc57ed9 --- /dev/null +++ b/libcxx/utils/libcxx-compare-benchmarks @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +set -e + +PROGNAME="$(basename "${0}")" +MONOREPO_ROOT="$(realpath $(dirname "${PROGNAME}"))" +function usage() { +cat < benchmarks... + +Compare the given benchmarks between the baseline and the candidate build directories. + +This requires those benchmarks to have already been generated in both build directories. + + The path to the build directory considered the baseline. + The path to the build directory considered the candidate. +benchmarks... Paths of the benchmarks to compare. Those paths are relative to ''. 
+ +Example +======= +$ libcxx-lit build1/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp +$ libcxx-lit build2/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp +$ ${PROGNAME} build1/ build2/ libcxx/test/benchmarks/algorithms/for_each.bench.cpp +EOF +} + +if [[ "${1}" == "-h" || "${1}" == "--help" ]]; then + usage + exit 0 +fi + +if [[ $# -lt 1 ]]; then + usage + exit 1 +fi + +baseline="${1}" +candidate="${2}" +shift; shift + +GBENCH="${MONOREPO_ROOT}/third-party/benchmark" + +python3 -m venv /tmp/libcxx-compare-benchmarks-venv +source /tmp/libcxx-compare-benchmarks-venv/bin/activate +pip3 install -r ${GBENCH}/tools/requirements.txt + +for benchmark in ${@}; do + base="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${baseline} ${benchmark})" + cand="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${candidate} ${benchmark})" + + if [[ ! -e "${base}" ]]; then + echo "Benchmark ${benchmark} does not exist in the baseline" + continue + fi + if [[ ! -e "${cand}" ]]; then + echo "Benchmark ${benchmark} does not exist in the candidate" + continue + fi + + "${GBENCH}/tools/compare.py" benchmarks "${base}" "${cand}" +done diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp index d2e5b52917ac2..4c326a29812f7 100644 --- a/lldb/source/Host/posix/FileSystemPosix.cpp +++ b/lldb/source/Host/posix/FileSystemPosix.cpp @@ -11,9 +11,6 @@ // C includes #include #include -#if defined(__NetBSD__) -#include -#endif // lldb Includes #include "lldb/Host/Host.h" diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index b3916cc913f7d..5f85f99ce7bdd 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -1031,6 +1031,8 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) { std::vector ph_bytes; ph_bytes.resize(elf_header.e_phentsize); + 
lldb::addr_t base_addr = 0; + bool found_first_load_segment = false; for (unsigned int i = 0; i < elf_header.e_phnum; ++i) { byte_read = ReadMemory(ph_addr + i * elf_header.e_phentsize, ph_bytes.data(), elf_header.e_phentsize, error); @@ -1041,6 +1043,11 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) { offset = 0; elf::ELFProgramHeader program_header; program_header.Parse(program_header_data, &offset); + if (program_header.p_type == llvm::ELF::PT_LOAD && + !found_first_load_segment) { + base_addr = program_header.p_vaddr; + found_first_load_segment = true; + } if (program_header.p_type != llvm::ELF::PT_NOTE) continue; @@ -1049,7 +1056,7 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) { // We need to slide the address of the p_vaddr as these values don't get // relocated in memory. - const lldb::addr_t vaddr = program_header.p_vaddr + address; + const lldb::addr_t vaddr = program_header.p_vaddr + address - base_addr; byte_read = ReadMemory(vaddr, note_bytes.data(), program_header.p_memsz, error); if (byte_read != program_header.p_memsz) diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index d68098bf7b326..43fc18873feb3 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -1,7 +1,3 @@ -if ( CMAKE_SYSTEM_NAME MATCHES "Windows" OR CMAKE_SYSTEM_NAME MATCHES "NetBSD" ) - list(APPEND extra_libs lldbHost) -endif () - if (HAVE_LIBPTHREAD) list(APPEND extra_libs pthread) endif () @@ -26,9 +22,11 @@ add_lldb_tool(lldb-dap lldb-dap.cpp Breakpoint.cpp BreakpointBase.cpp + DAP.cpp ExceptionBreakpoint.cpp FifoFiles.cpp FunctionBreakpoint.cpp + InstructionBreakpoint.cpp IOStream.cpp JSONUtils.cpp LLDBUtils.cpp @@ -36,12 +34,11 @@ add_lldb_tool(lldb-dap ProgressEvent.cpp RunInTerminal.cpp SourceBreakpoint.cpp - DAP.cpp Watchpoint.cpp - InstructionBreakpoint.cpp LINK_LIBS liblldb + lldbHost ${extra_libs} LINK_COMPONENTS diff --git a/lldb/tools/lldb-dap/DAP.cpp 
b/lldb/tools/lldb-dap/DAP.cpp index 35250d9eef608..a67abe582abd4 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -6,34 +6,62 @@ // //===----------------------------------------------------------------------===// -#include -#include -#include -#include - #include "DAP.h" #include "JSONUtils.h" #include "LLDBUtils.h" +#include "OutputRedirector.h" +#include "lldb/API/SBBreakpoint.h" #include "lldb/API/SBCommandInterpreter.h" +#include "lldb/API/SBCommandReturnObject.h" #include "lldb/API/SBLanguageRuntime.h" #include "lldb/API/SBListener.h" +#include "lldb/API/SBProcess.h" #include "lldb/API/SBStream.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Utility/Status.h" +#include "lldb/lldb-defines.h" +#include "lldb/lldb-enumerations.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include +#include +#include +#include #if defined(_WIN32) #define NOMINMAX #include #include #include +#else +#include #endif using namespace lldb_dap; +namespace { +#ifdef _WIN32 +const char DEV_NULL[] = "nul"; +#else +const char DEV_NULL[] = "/dev/null"; +#endif +} // namespace + namespace lldb_dap { -DAP::DAP(llvm::StringRef path, ReplMode repl_mode) - : debug_adaptor_path(path), broadcaster("lldb-dap"), +DAP::DAP(llvm::StringRef path, std::ofstream *log, ReplMode repl_mode, + StreamDescriptor input, StreamDescriptor output) + : debug_adaptor_path(path), log(log), input(std::move(input)), + output(std::move(output)), broadcaster("lldb-dap"), exception_breakpoints(), focus_tid(LLDB_INVALID_THREAD_ID), stop_at_entry(false), is_attach(false), enable_auto_variable_summaries(false), @@ -43,21 +71,7 @@ DAP::DAP(llvm::StringRef path, ReplMode repl_mode) configuration_done_sent(false), waiting_for_run_in_terminal(false), 
progress_event_reporter( [&](const ProgressEvent &event) { SendJSON(event.ToJSON()); }), - reverse_request_seq(0), repl_mode(repl_mode) { - const char *log_file_path = getenv("LLDBDAP_LOG"); -#if defined(_WIN32) - // Windows opens stdout and stdin in text mode which converts \n to 13,10 - // while the value is just 10 on Darwin/Linux. Setting the file mode to binary - // fixes this. - int result = _setmode(fileno(stdout), _O_BINARY); - assert(result); - result = _setmode(fileno(stdin), _O_BINARY); - UNUSED_IF_ASSERT_DISABLED(result); - assert(result); -#endif - if (log_file_path) - log.reset(new std::ofstream(log_file_path)); -} + reverse_request_seq(0), repl_mode(repl_mode) {} DAP::~DAP() = default; @@ -173,6 +187,45 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) { return nullptr; } +llvm::Error DAP::ConfigureIO(std::FILE *overrideOut, std::FILE *overrideErr) { + in = lldb::SBFile(std::fopen(DEV_NULL, "r"), /*transfer_ownership=*/true); + + if (auto Error = out.RedirectTo([this](llvm::StringRef output) { + SendOutput(OutputType::Stdout, output); + })) + return Error; + + if (overrideOut) { + auto fd = out.GetWriteFileDescriptor(); + if (auto Error = fd.takeError()) + return Error; + + if (dup2(*fd, fileno(overrideOut)) == -1) + return llvm::errorCodeToError(llvm::errnoAsErrorCode()); + } + + if (auto Error = err.RedirectTo([this](llvm::StringRef output) { + SendOutput(OutputType::Stderr, output); + })) + return Error; + + if (overrideErr) { + auto fd = err.GetWriteFileDescriptor(); + if (auto Error = fd.takeError()) + return Error; + + if (dup2(*fd, fileno(overrideErr)) == -1) + return llvm::errorCodeToError(llvm::errnoAsErrorCode()); + } + + return llvm::Error::success(); +} + +void DAP::StopIO() { + out.Stop(); + err.Stop(); +} + // Send the JSON in "json_str" to the "out" stream. Correctly send the // "Content-Length:" field followed by the length, followed by the raw // JSON bytes. 
@@ -208,19 +261,19 @@ std::string DAP::ReadJSON() { std::string json_str; int length; - if (!input.read_expected(log.get(), "Content-Length: ")) + if (!input.read_expected(log, "Content-Length: ")) return json_str; - if (!input.read_line(log.get(), length_str)) + if (!input.read_line(log, length_str)) return json_str; if (!llvm::to_integer(length_str, length)) return json_str; - if (!input.read_expected(log.get(), "\r\n")) + if (!input.read_expected(log, "\r\n")) return json_str; - if (!input.read_full(log.get(), length, json_str)) + if (!input.read_full(log, length, json_str)) return json_str; if (log) { diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index ae496236f1336..846300cb945b0 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -9,36 +9,38 @@ #ifndef LLDB_TOOLS_LLDB_DAP_DAP_H #define LLDB_TOOLS_LLDB_DAP_DAP_H -#include -#include -#include -#include -#include - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/JSON.h" -#include "llvm/Support/Threading.h" -#include "llvm/Support/raw_ostream.h" - -#include "lldb/API/SBAttachInfo.h" -#include "lldb/API/SBCommandInterpreter.h" -#include "lldb/API/SBCommandReturnObject.h" -#include "lldb/API/SBDebugger.h" -#include "lldb/API/SBEvent.h" -#include "lldb/API/SBFormat.h" -#include "lldb/API/SBLaunchInfo.h" -#include "lldb/API/SBTarget.h" -#include "lldb/API/SBThread.h" - +#include "DAPForward.h" #include "ExceptionBreakpoint.h" #include "FunctionBreakpoint.h" #include "IOStream.h" #include "InstructionBreakpoint.h" +#include "OutputRedirector.h" #include "ProgressEvent.h" #include "SourceBreakpoint.h" +#include "lldb/API/SBBroadcaster.h" +#include "lldb/API/SBCommandInterpreter.h" +#include "lldb/API/SBDebugger.h" +#include "lldb/API/SBError.h" +#include "lldb/API/SBFile.h" +#include "lldb/API/SBFormat.h" +#include "lldb/API/SBFrame.h" +#include "lldb/API/SBTarget.h" 
+#include "lldb/API/SBThread.h" +#include "lldb/API/SBValue.h" +#include "lldb/API/SBValueList.h" +#include "lldb/lldb-types.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/Threading.h" +#include +#include +#include +#include +#include #define VARREF_LOCALS (int64_t)1 #define VARREF_GLOBALS (int64_t)2 @@ -138,15 +140,18 @@ struct SendEventRequestHandler : public lldb::SBCommandPluginInterface { struct DAP { llvm::StringRef debug_adaptor_path; + std::ofstream *log; InputStream input; OutputStream output; + lldb::SBFile in; + OutputRedirector out; + OutputRedirector err; lldb::SBDebugger debugger; lldb::SBTarget target; Variables variables; lldb::SBBroadcaster broadcaster; std::thread event_thread; std::thread progress_event_thread; - std::unique_ptr log; llvm::StringMap source_breakpoints; FunctionBreakpointMap function_breakpoints; InstructionBreakpointMap instruction_breakpoints; @@ -198,13 +203,23 @@ struct DAP { // will contain that expression. std::string last_nonempty_var_expression; - DAP(llvm::StringRef path, ReplMode repl_mode); + DAP(llvm::StringRef path, std::ofstream *log, ReplMode repl_mode, + StreamDescriptor input, StreamDescriptor output); ~DAP(); DAP(const DAP &rhs) = delete; void operator=(const DAP &rhs) = delete; ExceptionBreakpoint *GetExceptionBreakpoint(const std::string &filter); ExceptionBreakpoint *GetExceptionBreakpoint(const lldb::break_id_t bp_id); + /// Redirect stdout and stderr to the IDE's console output. + /// + /// Failures are not reported here; they are returned as an llvm::Error + /// for the caller to handle. + llvm::Error ConfigureIO(std::FILE *overrideOut, std::FILE *overrideErr); + + /// Stop the redirected IO threads and associated pipes. + void StopIO(); + // Serialize the JSON value into a string and send the JSON packet to // the "out" stream. 
void SendJSON(const llvm::json::Value &json); diff --git a/lldb/tools/lldb-dap/IOStream.h b/lldb/tools/lldb-dap/IOStream.h index 57d5fd458b716..74889eb2e5a86 100644 --- a/lldb/tools/lldb-dap/IOStream.h +++ b/lldb/tools/lldb-dap/IOStream.h @@ -52,6 +52,9 @@ struct StreamDescriptor { struct InputStream { StreamDescriptor descriptor; + explicit InputStream(StreamDescriptor descriptor) + : descriptor(std::move(descriptor)) {} + bool read_full(std::ofstream *log, size_t length, std::string &text); bool read_line(std::ofstream *log, std::string &line); @@ -62,6 +65,9 @@ struct InputStream { struct OutputStream { StreamDescriptor descriptor; + explicit OutputStream(StreamDescriptor descriptor) + : descriptor(std::move(descriptor)) {} + bool write_full(llvm::StringRef str); }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/OutputRedirector.cpp b/lldb/tools/lldb-dap/OutputRedirector.cpp index 2c2f49569869b..8fcbcfec99c44 100644 --- a/lldb/tools/lldb-dap/OutputRedirector.cpp +++ b/lldb/tools/lldb-dap/OutputRedirector.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===/ +#include "llvm/Support/Error.h" +#include #if defined(_WIN32) #include #include @@ -17,47 +19,59 @@ #include "OutputRedirector.h" #include "llvm/ADT/StringRef.h" -using namespace llvm; +using lldb_private::Pipe; +using lldb_private::Status; +using llvm::createStringError; +using llvm::Error; +using llvm::Expected; +using llvm::StringRef; namespace lldb_dap { -Error RedirectFd(int fd, std::function callback) { - int new_fd[2]; -#if defined(_WIN32) - if (_pipe(new_fd, 4096, O_TEXT) == -1) { -#else - if (pipe(new_fd) == -1) { -#endif - int error = errno; - return createStringError(inconvertibleErrorCode(), - "Couldn't create new pipe for fd %d. 
%s", fd, - strerror(error)); - } +Expected OutputRedirector::GetWriteFileDescriptor() { + if (!m_pipe.CanWrite()) + return createStringError(std::errc::bad_file_descriptor, + "write handle is not open for writing"); + return m_pipe.GetWriteFileDescriptor(); +} - if (dup2(new_fd[1], fd) == -1) { - int error = errno; - return createStringError(inconvertibleErrorCode(), - "Couldn't override the fd %d. %s", fd, - strerror(error)); - } +Error OutputRedirector::RedirectTo(std::function callback) { + Status status = m_pipe.CreateNew(/*child_process_inherit=*/false); + if (status.Fail()) + return status.takeError(); - int read_fd = new_fd[0]; - std::thread t([read_fd, callback]() { + m_forwarder = std::thread([this, callback]() { char buffer[OutputBufferSize]; - while (true) { - ssize_t bytes_count = read(read_fd, &buffer, sizeof(buffer)); - if (bytes_count == 0) - return; - if (bytes_count == -1) { - if (errno == EAGAIN || errno == EINTR) - continue; + while (m_pipe.CanRead() && !m_stopped) { + size_t bytes_read; + Status status = m_pipe.Read(&buffer, sizeof(buffer), bytes_read); + if (status.Fail()) + continue; + + // EOF detected + if (bytes_read == 0 || m_stopped) break; - } - callback(StringRef(buffer, bytes_count)); + + callback(StringRef(buffer, bytes_read)); } }); - t.detach(); + return Error::success(); } +void OutputRedirector::Stop() { + m_stopped = true; + + if (m_pipe.CanWrite()) { + // Closing the pipe may not be sufficient to wake up the thread in case the + // write descriptor is duplicated (to stdout/err or to another process). + // Write a null byte to ensure the read call returns. 
+ char buf[] = "\0"; + size_t bytes_written; + m_pipe.Write(buf, sizeof(buf), bytes_written); + m_pipe.CloseWriteFileDescriptor(); + m_forwarder.join(); + } +} + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/OutputRedirector.h b/lldb/tools/lldb-dap/OutputRedirector.h index e26d1648b104f..41ea05c22c691 100644 --- a/lldb/tools/lldb-dap/OutputRedirector.h +++ b/lldb/tools/lldb-dap/OutputRedirector.h @@ -9,17 +9,39 @@ #ifndef LLDB_TOOLS_LLDB_DAP_OUTPUT_REDIRECTOR_H #define LLDB_TOOLS_LLDB_DAP_OUTPUT_REDIRECTOR_H +#include "lldb/Host/Pipe.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" +#include +#include +#include namespace lldb_dap { -/// Redirects the output of a given file descriptor to a callback. -/// -/// \return -/// \a Error::success if the redirection was set up correctly, or an error -/// otherwise. -llvm::Error RedirectFd(int fd, std::function callback); +class OutputRedirector { +public: + /// Creates writable file descriptor that will invoke the given callback on + /// each write in a background thread. + /// + /// \return + /// \a Error::success if the redirection was set up correctly, or an error + /// otherwise. 
+ llvm::Error RedirectTo(std::function callback); + + llvm::Expected GetWriteFileDescriptor(); + void Stop(); + + ~OutputRedirector() { Stop(); } + + OutputRedirector() = default; + OutputRedirector(const OutputRedirector &) = delete; + OutputRedirector &operator=(const OutputRedirector &) = delete; + +private: + std::atomic m_stopped = false; + lldb_private::Pipe m_pipe; + std::thread m_forwarder; +}; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 7e8f7b5f6df67..6c524081c493e 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -10,10 +10,10 @@ #include "FifoFiles.h" #include "JSONUtils.h" #include "LLDBUtils.h" -#include "OutputRedirector.h" #include "RunInTerminal.h" #include "Watchpoint.h" #include "lldb/API/SBDeclaration.h" +#include "lldb/API/SBEvent.h" #include "lldb/API/SBInstruction.h" #include "lldb/API/SBListener.h" #include "lldb/API/SBMemoryRegionInfo.h" @@ -41,9 +41,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -140,15 +142,14 @@ lldb::SBValueList *GetTopLevelScope(DAP &dap, int64_t variablesReference) { } } -SOCKET AcceptConnection(DAP &dap, int portno) { +SOCKET AcceptConnection(std::ofstream *log, int portno) { // Accept a socket connection from any host on "portno". 
SOCKET newsockfd = -1; struct sockaddr_in serv_addr, cli_addr; SOCKET sockfd = socket(AF_INET, SOCK_STREAM, 0); if (sockfd < 0) { - if (dap.log) - *dap.log << "error: opening socket (" << strerror(errno) << ")" - << std::endl; + if (log) + *log << "error: opening socket (" << strerror(errno) << ")" << std::endl; } else { memset((char *)&serv_addr, 0, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; @@ -156,9 +157,9 @@ SOCKET AcceptConnection(DAP &dap, int portno) { serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); serv_addr.sin_port = htons(portno); if (bind(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { - if (dap.log) - *dap.log << "error: binding socket (" << strerror(errno) << ")" - << std::endl; + if (log) + *log << "error: binding socket (" << strerror(errno) << ")" + << std::endl; } else { listen(sockfd, 5); socklen_t clilen = sizeof(cli_addr); @@ -166,8 +167,8 @@ SOCKET AcceptConnection(DAP &dap, int portno) { llvm::sys::RetryAfterSignal(static_cast(-1), accept, sockfd, (struct sockaddr *)&cli_addr, &clilen); if (newsockfd < 0) - if (dap.log) - *dap.log << "error: accept (" << strerror(errno) << ")" << std::endl; + if (log) + *log << "error: accept (" << strerror(errno) << ")" << std::endl; } #if defined(_WIN32) closesocket(sockfd); @@ -1102,6 +1103,7 @@ void request_disconnect(DAP &dap, const llvm::json::Object &request) { dap.broadcaster.BroadcastEventByType(eBroadcastBitStopProgressThread); dap.progress_event_thread.join(); } + dap.StopIO(); dap.disconnecting = true; } @@ -1871,7 +1873,36 @@ void request_initialize(DAP &dap, const llvm::json::Object &request) { // which may affect the outcome of tests. bool source_init_file = GetBoolean(arguments, "sourceInitFile", true); - dap.debugger = lldb::SBDebugger::Create(source_init_file); + // Do not source init files until in/out/err are configured. 
+ dap.debugger = lldb::SBDebugger::Create(false); + dap.debugger.SetInputFile(dap.in); + auto out_fd = dap.out.GetWriteFileDescriptor(); + if (llvm::Error err = out_fd.takeError()) { + response["success"] = false; + EmplaceSafeString(response, "message", llvm::toString(std::move(err))); + dap.SendJSON(llvm::json::Value(std::move(response))); + return; + } + dap.debugger.SetOutputFile(lldb::SBFile(*out_fd, "w", false)); + auto err_fd = dap.err.GetWriteFileDescriptor(); + if (llvm::Error err = err_fd.takeError()) { + response["success"] = false; + EmplaceSafeString(response, "message", llvm::toString(std::move(err))); + dap.SendJSON(llvm::json::Value(std::move(response))); + return; + } + dap.debugger.SetErrorFile(lldb::SBFile(*err_fd, "w", false)); + + auto interp = dap.debugger.GetCommandInterpreter(); + + if (source_init_file) { + dap.debugger.SkipLLDBInitFiles(false); + dap.debugger.SkipAppInitFiles(false); + lldb::SBCommandReturnObject init; + interp.SourceInitFileInGlobalDirectory(init); + interp.SourceInitFileInHomeDirectory(init); + } + if (llvm::Error err = dap.RunPreInitCommands()) { response["success"] = false; EmplaceSafeString(response, "message", llvm::toString(std::move(err))); @@ -4910,36 +4941,14 @@ static void redirection_test() { fflush(stderr); } -/// Redirect stdout and stderr fo the IDE's console output. -/// -/// Errors in this operation will be printed to the log file and the IDE's -/// console output as well. -/// -/// \return -/// A fd pointing to the original stdout. 
-static int SetupStdoutStderrRedirection(DAP &dap) { - int stdoutfd = fileno(stdout); - int new_stdout_fd = dup(stdoutfd); - auto output_callback_stderr = [&dap](llvm::StringRef data) { - dap.SendOutput(OutputType::Stderr, data); - }; - auto output_callback_stdout = [&dap](llvm::StringRef data) { - dap.SendOutput(OutputType::Stdout, data); - }; - if (llvm::Error err = RedirectFd(stdoutfd, output_callback_stdout)) { - std::string error_message = llvm::toString(std::move(err)); - if (dap.log) - *dap.log << error_message << std::endl; - output_callback_stderr(error_message); - } - if (llvm::Error err = RedirectFd(fileno(stderr), output_callback_stderr)) { - std::string error_message = llvm::toString(std::move(err)); - if (dap.log) - *dap.log << error_message << std::endl; - output_callback_stderr(error_message); - } - - return new_stdout_fd; +/// Duplicates a file descriptor, setting FD_CLOEXEC if applicable. +static int DuplicateFileDescriptor(int fd) { +#if defined(F_DUPFD_CLOEXEC) + // Ensure FD_CLOEXEC is set. + return ::fcntl(fd, F_DUPFD_CLOEXEC, 0); +#else + return ::dup(fd); +#endif } int main(int argc, char *argv[]) { @@ -5030,47 +5039,88 @@ int main(int argc, char *argv[]) { } #endif + std::unique_ptr log = nullptr; + const char *log_file_path = getenv("LLDBDAP_LOG"); + if (log_file_path) + log = std::make_unique(log_file_path); + // Initialize LLDB first before we do anything. - lldb::SBDebugger::Initialize(); + lldb::SBError error = lldb::SBDebugger::InitializeWithErrorHandling(); + if (error.Fail()) { + lldb::SBStream os; + error.GetDescription(os); + llvm::errs() << "lldb initialize failed: " << os.GetData() << "\n"; + return EXIT_FAILURE; + } // Terminate the debugger before the C++ destructor chain kicks in. 
auto terminate_debugger = llvm::make_scope_exit([] { lldb::SBDebugger::Terminate(); }); - DAP dap = DAP(program_path.str(), default_repl_mode); - - RegisterRequestCallbacks(dap); - - // stdout/stderr redirection to the IDE's console - int new_stdout_fd = SetupStdoutStderrRedirection(dap); - + StreamDescriptor input; + StreamDescriptor output; + std::FILE *redirectOut = nullptr; + std::FILE *redirectErr = nullptr; if (portno != -1) { printf("Listening on port %i...\n", portno); - SOCKET socket_fd = AcceptConnection(dap, portno); - if (socket_fd >= 0) { - dap.input.descriptor = StreamDescriptor::from_socket(socket_fd, true); - dap.output.descriptor = StreamDescriptor::from_socket(socket_fd, false); - } else { + SOCKET socket_fd = AcceptConnection(log.get(), portno); + if (socket_fd < 0) return EXIT_FAILURE; - } + + input = StreamDescriptor::from_socket(socket_fd, true); + output = StreamDescriptor::from_socket(socket_fd, false); } else { - dap.input.descriptor = StreamDescriptor::from_file(fileno(stdin), false); - dap.output.descriptor = StreamDescriptor::from_file(new_stdout_fd, false); +#if defined(_WIN32) + // Windows opens stdout and stdin in text mode which converts \n to 13,10 + // while the value is just 10 on Darwin/Linux. Setting the file mode to + // binary fixes this. 
+ int result = _setmode(fileno(stdout), _O_BINARY); + assert(result); + result = _setmode(fileno(stdin), _O_BINARY); + UNUSED_IF_ASSERT_DISABLED(result); + assert(result); +#endif - /// used only by TestVSCode_redirection_to_console.py - if (getenv("LLDB_DAP_TEST_STDOUT_STDERR_REDIRECTION") != nullptr) - redirection_test(); + int stdout_fd = DuplicateFileDescriptor(fileno(stdout)); + if (stdout_fd == -1) { + llvm::logAllUnhandledErrors( + llvm::errorCodeToError(llvm::errnoAsErrorCode()), llvm::errs(), + "Failed to configure stdout redirect: "); + return EXIT_FAILURE; + } + + redirectOut = stdout; + redirectErr = stderr; + + input = StreamDescriptor::from_file(fileno(stdin), false); + output = StreamDescriptor::from_file(stdout_fd, false); + } + + DAP dap = DAP(program_path.str(), log.get(), default_repl_mode, + std::move(input), std::move(output)); + + // stdout/stderr redirection to the IDE's console + if (auto Err = dap.ConfigureIO(redirectOut, redirectErr)) { + llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), + "Failed to configure lldb-dap IO operations: "); + return EXIT_FAILURE; } + RegisterRequestCallbacks(dap); + for (const std::string &arg : input_args.getAllArgValues(OPT_pre_init_command)) { dap.pre_init_commands.push_back(arg); } + // used only by TestVSCode_redirection_to_console.py + if (getenv("LLDB_DAP_TEST_STDOUT_STDERR_REDIRECTION") != nullptr) + redirection_test(); + bool CleanExit = true; if (auto Err = dap.Loop()) { - if (dap.log) - *dap.log << "Transport Error: " << llvm::toString(std::move(Err)) << "\n"; + if (log) + *log << "Transport Error: " << llvm::toString(std::move(Err)) << "\n"; CleanExit = false; } diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst index 3971d3788b8a0..fafcb247f49c9 100644 --- a/llvm/docs/DirectX/DXILResources.rst +++ b/llvm/docs/DirectX/DXILResources.rst @@ -274,6 +274,50 @@ Examples: @llvm.dx.handle.fromHeap.tdx.RawBuffer_v4f32_1_0( i32 2, i1 false) +Accessing Resources 
as Memory +----------------------------- + +*relevant types: Buffers, CBuffer, and Textures* + +Loading and storing from resources is generally represented in LLVM using +operations on memory that is only accessible via a handle object. Given a +handle, ``llvm.dx.resource.getpointer`` gives a pointer that can be used to read +and (depending on type) write to the resource. + +Accesses using ``llvm.dx.resource.getpointer`` are replaced with direct load and +store operations in the ``DXILResourceAccess`` pass. These direct loads and +stores are described later in this document. + +.. note:: Currently the pointers returned by ``llvm.dx.resource.getpointer`` are in + the default address space, but that will likely change in the future. + +.. list-table:: ``@llvm.dx.resource.getpointer`` + :header-rows: 1 + + * - Argument + - + - Type + - Description + * - Return value + - + - Pointer + - A pointer to an object in the buffer + * - ``%buffer`` + - 0 + - ``target(dx.TypedBuffer, ...)`` + - The buffer to access + * - ``%index`` + - 1 + - ``i32`` + - Index into the buffer + +Examples: + +.. code-block:: llvm + + %ptr = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_v4f32_0_0_0t( + target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index) + 16-byte Loads, Samples, and Gathers ----------------------------------- @@ -296,8 +340,8 @@ instead. That is, ``llvm.dx.resource.load.typedbuffer`` from a of 4 floats, and from ``Buffer`` a vector of two doubles, etc. The operations are then expanded out to match DXIL's format during lowering. -In cases where we need ``CheckAccessFullyMapped``, we have a second intrinsic -that returns an anonymous struct with element-0 being the contained type, and +In order to support ``CheckAccessFullyMapped``, we need these intrinsics to +return an anonymous struct with element-0 being the contained type, and element-1 being the ``i1`` result of a ``CheckAccessFullyMapped`` call. 
We don't have a separate call to ``CheckAccessFullyMapped`` at all, since that's the only operation that can possibly be done on this value. In practice this @@ -317,8 +361,8 @@ HLSL source, but this actually matches DXC's behaviour in practice. - Description * - Return value - - - The contained type of the buffer - - The data loaded from the buffer + - A structure of the contained type and the check bit + - The data loaded from the buffer and the check bit * - ``%buffer`` - 0 - ``target(dx.TypedBuffer, ...)`` @@ -332,48 +376,22 @@ Examples: .. code-block:: llvm - %ret = call <4 x float> + %ret = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_0_0_0t( target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index) - %ret = call float + %ret = call {float, i1} @llvm.dx.resource.load.typedbuffer.f32.tdx.TypedBuffer_f32_0_0_0t( target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 %index) - %ret = call <4 x i32> + %ret = call {<4 x i32>, i1} @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v4i32_0_0_0t( target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 %index) - %ret = call <4 x half> + %ret = call {<4 x half>, i1} @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_0_0_0t( target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 %index) - %ret = call <2 x double> + %ret = call {<2 x double>, i1} @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_0_0t( target("dx.TypedBuffer", <2 x double>, 0, 0, 0) %buffer, i32 %index) -.. list-table:: ``@llvm.dx.resource.loadchecked.typedbuffer`` - :header-rows: 1 - - * - Argument - - - - Type - - Description - * - Return value - - - - A structure of the contained type and the check bit - - The data loaded from the buffer and the check bit - * - ``%buffer`` - - 0 - - ``target(dx.TypedBuffer, ...)`` - - The buffer to load from - * - ``%index`` - - 1 - - ``i32`` - - Index into the buffer - -.. 
code-block:: llvm - - %ret = call {<4 x float>, i1} - @llvm.dx.resource.loadchecked.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_0_0_0t( - target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index) - Texture and Typed Buffer Stores ------------------------------- diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 8abacf1b546a0..1bc69f791bd84 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -689,6 +689,9 @@ enum : unsigned { // ELF Relocation types for RISC-V enum { #include "ELFRelocs/RISCV.def" +#define ELF_RISCV_NONSTANDARD_RELOC(_vendor, name, value) name = value, +#include "ELFRelocs/RISCV_nonstandard.def" +#undef ELF_RISCV_NONSTANDARD_RELOC }; enum { diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def new file mode 100644 index 0000000000000..7ae3d3f205772 --- /dev/null +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def @@ -0,0 +1,28 @@ +//===--- RISC-V Nonstandard Relocation List ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef ELF_RISCV_NONSTANDARD_RELOC +#error "ELF_RISCV_NONSTANDARD_RELOC must be defined" +#endif + +// ELF_RISCV_NONSTANDARD_RELOC(VENDOR, NAME, ID) defines information about +// nonstandard relocation codes. This can be used when parsing relocations, or +// when printing them, to provide better information. +// +// VENDOR should be the symbol name expected in the associated `R_RISCV_VENDOR` +// relocation. NAME and ID work like `ELF_RELOC` but the mapping is not expected +// to be 1:1. 
+// +// The mapping in RISCV.def is 1:1, and should be used when the only information +// available is the relocation enum value. + +// Qualcomm Nonstandard Relocations +ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_ABS20_U, 192) +ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_BRANCH, 193) +ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_32, 194) +ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_JUMP_PLT, 195) diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d31d5afe5145a..dfdfda963b627 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -31,9 +31,6 @@ def int_dx_resource_getpointer : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty], [IntrNoMem]>; def int_dx_resource_load_typedbuffer - : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty], - [IntrReadMem]>; -def int_dx_resource_loadchecked_typedbuffer : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty], [llvm_any_ty, llvm_i32_ty], [IntrReadMem]>; def int_dx_resource_store_typedbuffer @@ -43,7 +40,7 @@ def int_dx_resource_store_typedbuffer def int_dx_resource_updatecounter : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty], [IntrInaccessibleMemOrArgMemOnly]>; - + // Cast between target extension handle types and dxil-style opaque handles def int_dx_resource_casthandle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>; @@ -105,7 +102,7 @@ def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrCon def int_dx_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; -def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>], +def 
int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>], [LLVMScalarOrSameVectorWidth<0, llvm_double_ty>], [IntrNoMem]>; def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_dx_discard : DefaultAttrsIntrinsic<[], [llvm_i1_ty], []>; diff --git a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h deleted file mode 100644 index dfb6e857b3a6a..0000000000000 --- a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h +++ /dev/null @@ -1,39 +0,0 @@ -//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This file contains the definitions of the enumerations and flags -/// associated with NVVM Intrinsics. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_IR_NVVMINTRINSICFLAGS_H -#define LLVM_IR_NVVMINTRINSICFLAGS_H - -#include - -namespace llvm { -namespace nvvm { - -// Reduction Ops supported with TMA Copy from Shared -// to Global Memory for the "cp.reduce.async.bulk.tensor.*" -// family of PTX instructions. 
-enum class TMAReductionOp : uint8_t { - ADD = 0, - MIN = 1, - MAX = 2, - INC = 3, - DEC = 4, - AND = 5, - OR = 6, - XOR = 7, -}; - -} // namespace nvvm -} // namespace llvm -#endif // LLVM_IR_NVVMINTRINSICFLAGS_H diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h new file mode 100644 index 0000000000000..8ca073ba82253 --- /dev/null +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -0,0 +1,176 @@ +//===--- NVVMIntrinsicUtils.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file contains the definitions of the enumerations and flags +/// associated with NVVM Intrinsics, along with some helper functions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_NVVMINTRINSICUTILS_H +#define LLVM_IR_NVVMINTRINSICUTILS_H + +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsNVPTX.h" + +namespace llvm { +namespace nvvm { + +// Reduction Ops supported with TMA Copy from Shared +// to Global Memory for the "cp.reduce.async.bulk.tensor.*" +// family of PTX instructions. 
+enum class TMAReductionOp : uint8_t { + ADD = 0, + MIN = 1, + MAX = 2, + INC = 3, + DEC = 4, + AND = 5, + OR = 6, + XOR = 7, +}; + +inline bool IntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + // Float to i32 / i64 conversion intrinsics: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2i_rz_ftz: + + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_f2ui_rz_ftz: + + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ll_rn_ftz: + case Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ll_rz_ftz: + + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_f2ull_rz_ftz: + return true; + } + return false; +} + +inline bool IntrinsicConvertsToSignedInteger(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + // f2i + case Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_f2i_rz_ftz: + // d2i + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2i_rz: + // f2ll + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ll_rn_ftz: + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ll_rz: + case Intrinsic::nvvm_f2ll_rz_ftz: + // d2ll + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ll_rz: + return true; + } + return false; +} + +inline APFloat::roundingMode +IntrinsicGetRoundingMode(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + // RM: + case 
Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2ui_rm: + + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ull_rm: + return APFloat::rmTowardNegative; + + // RN: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2ui_rn: + + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ll_rn_ftz: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ull_rn: + return APFloat::rmNearestTiesToEven; + + // RP: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2ui_rp: + + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ull_rp: + return APFloat::rmTowardPositive; + + // RZ: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_f2i_rz_ftz: + case Intrinsic::nvvm_f2ui_rz_ftz: + case Intrinsic::nvvm_d2i_rz: + case Intrinsic::nvvm_d2ui_rz: + + case Intrinsic::nvvm_f2ll_rz: + case Intrinsic::nvvm_f2ull_rz: + case Intrinsic::nvvm_f2ll_rz_ftz: + case Intrinsic::nvvm_f2ull_rz_ftz: + case Intrinsic::nvvm_d2ll_rz: + case Intrinsic::nvvm_d2ull_rz: + return APFloat::rmTowardZero; + } + llvm_unreachable("Invalid f2i/d2i rounding mode intrinsic"); + return APFloat::roundingMode::Invalid; +} + +} // namespace nvvm +} // namespace llvm +#endif // LLVM_IR_NVVMINTRINSICUTILS_H diff --git 
a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index b37f967191aaa..cd9a36029e6db 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -2870,7 +2870,7 @@ template struct Signum_match { return false; unsigned ShiftWidth = TypeSize - 1; - Value *OpL = nullptr, *OpR = nullptr; + Value *Op; // This is the representation of signum we match: // @@ -2882,11 +2882,11 @@ template struct Signum_match { // // for i1 values. - auto LHS = m_AShr(m_Value(OpL), m_SpecificInt(ShiftWidth)); - auto RHS = m_LShr(m_Neg(m_Value(OpR)), m_SpecificInt(ShiftWidth)); - auto Signum = m_Or(LHS, RHS); + auto LHS = m_AShr(m_Value(Op), m_SpecificInt(ShiftWidth)); + auto RHS = m_LShr(m_Neg(m_Deferred(Op)), m_SpecificInt(ShiftWidth)); + auto Signum = m_c_Or(LHS, RHS); - return Signum.match(V) && OpL == OpR && Val.match(OpL); + return Signum.match(V) && Val.match(Op); } }; diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 76914ab34c1f6..844f11feef414 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -564,11 +564,6 @@ class Triple { bool isOSzOS() const { return getOS() == Triple::ZOS; } - /// Is this an Apple MachO triple. - bool isAppleMachO() const { - return (getVendor() == Triple::Apple) && isOSBinFormatMachO(); - } - /// Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit). 
bool isOSDarwin() const { return isMacOSX() || isiOS() || isWatchOS() || isDriverKit() || isXROS(); diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 88533f2972fa6..031d675c330ec 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -45,8 +45,10 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -1687,6 +1689,58 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::x86_avx512_cvttsd2usi64: return !Call->isStrictFP(); + // NVVM float/double to int32/uint32 conversion intrinsics + case Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2i_rz_ftz: + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_f2ui_rz_ftz: + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2i_rz: + case Intrinsic::nvvm_d2ui_rm: + case Intrinsic::nvvm_d2ui_rn: + case Intrinsic::nvvm_d2ui_rp: + case Intrinsic::nvvm_d2ui_rz: + + // NVVM float/double to int64/uint64 conversion intrinsics + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ll_rz: + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ll_rn_ftz: + case 
Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ll_rz_ftz: + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ull_rz: + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_f2ull_rz_ftz: + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ll_rz: + case Intrinsic::nvvm_d2ull_rm: + case Intrinsic::nvvm_d2ull_rn: + case Intrinsic::nvvm_d2ull_rp: + case Intrinsic::nvvm_d2ull_rz: + // Sign operations are actually bitwise operations, they do not raise // exceptions even for SNANs. case Intrinsic::fabs: @@ -1849,6 +1903,12 @@ inline bool llvm_fenv_testexcept() { return false; } +static const APFloat FTZPreserveSign(const APFloat &V) { + if (V.isDenormal()) + return APFloat::getZero(V.getSemantics(), V.isNegative()); + return V; +} + Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V, Type *Ty) { llvm_fenv_clearexcept(); @@ -2309,6 +2369,85 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return ConstantFP::get(Ty->getContext(), U); } + // NVVM float/double to signed/unsigned int32/int64 conversions: + switch (IntrinsicID) { + // f2i + case Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2i_rz_ftz: + // f2ui + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_f2ui_rz_ftz: + // d2i + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2i_rz: + // d2ui + case 
Intrinsic::nvvm_d2ui_rm: + case Intrinsic::nvvm_d2ui_rn: + case Intrinsic::nvvm_d2ui_rp: + case Intrinsic::nvvm_d2ui_rz: + // f2ll + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ll_rz: + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ll_rn_ftz: + case Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ll_rz_ftz: + // f2ull + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ull_rz: + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_f2ull_rz_ftz: + // d2ll + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ll_rz: + // d2ull + case Intrinsic::nvvm_d2ull_rm: + case Intrinsic::nvvm_d2ull_rn: + case Intrinsic::nvvm_d2ull_rp: + case Intrinsic::nvvm_d2ull_rz: { + // In float-to-integer conversion, NaN inputs are converted to 0. + if (U.isNaN()) + return ConstantInt::get(Ty, 0); + + APFloat::roundingMode RMode = nvvm::IntrinsicGetRoundingMode(IntrinsicID); + bool IsFTZ = nvvm::IntrinsicShouldFTZ(IntrinsicID); + bool IsSigned = nvvm::IntrinsicConvertsToSignedInteger(IntrinsicID); + + APSInt ResInt(Ty->getIntegerBitWidth(), !IsSigned); + auto FloatToRound = IsFTZ ? FTZPreserveSign(U) : U; + + bool IsExact = false; + APFloat::opStatus Status = + FloatToRound.convertToInteger(ResInt, RMode, &IsExact); + + if (Status != APFloat::opInvalidOp) + return ConstantInt::get(Ty, ResInt); + return nullptr; + } + } + /// We only fold functions with finite arguments. Folding NaN and inf is /// likely to be aborted with an exception anyway, and some host libms /// have known errors raising exceptions. 
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 515806428cbb2..999386c0a0491 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4275,25 +4275,27 @@ Value *llvm::simplifyFCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, return ::simplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } -static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const SimplifyQuery &Q, - bool AllowRefinement, - SmallVectorImpl *DropFlags, - unsigned MaxRecurse) { +static Value *simplifyWithOpsReplaced(Value *V, + ArrayRef> Ops, + const SimplifyQuery &Q, + bool AllowRefinement, + SmallVectorImpl *DropFlags, + unsigned MaxRecurse) { assert((AllowRefinement || !Q.CanUseUndef) && "If AllowRefinement=false then CanUseUndef=false"); + for (const auto &OpAndRepOp : Ops) { + // We cannot replace a constant, and shouldn't even try. + if (isa(OpAndRepOp.first)) + return nullptr; - // Trivial replacement. - if (V == Op) - return RepOp; + // Trivial replacement. + if (V == OpAndRepOp.first) + return OpAndRepOp.second; + } if (!MaxRecurse--) return nullptr; - // We cannot replace a constant, and shouldn't even try. - if (isa(Op)) - return nullptr; - auto *I = dyn_cast(V); if (!I) return nullptr; @@ -4303,11 +4305,6 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (isa(I)) return nullptr; - // For vector types, the simplification must hold per-lane, so forbid - // potentially cross-lane operations like shufflevector. - if (Op->getType()->isVectorTy() && !isNotCrossLaneOperation(I)) - return nullptr; - // Don't fold away llvm.is.constant checks based on assumptions. 
if (match(I, m_Intrinsic())) return nullptr; @@ -4316,12 +4313,20 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (isa(I)) return nullptr; + for (const auto &OpAndRepOp : Ops) { + // For vector types, the simplification must hold per-lane, so forbid + // potentially cross-lane operations like shufflevector. + if (OpAndRepOp.first->getType()->isVectorTy() && + !isNotCrossLaneOperation(I)) + return nullptr; + } + // Replace Op with RepOp in instruction operands. SmallVector NewOps; bool AnyReplaced = false; for (Value *InstOp : I->operands()) { - if (Value *NewInstOp = simplifyWithOpReplaced( - InstOp, Op, RepOp, Q, AllowRefinement, DropFlags, MaxRecurse)) { + if (Value *NewInstOp = simplifyWithOpsReplaced( + InstOp, Ops, Q, AllowRefinement, DropFlags, MaxRecurse)) { NewOps.push_back(NewInstOp); AnyReplaced = InstOp != NewInstOp; } else { @@ -4372,7 +4377,8 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // by assumption and this case never wraps, so nowrap flags can be // ignored. if ((Opcode == Instruction::Sub || Opcode == Instruction::Xor) && - NewOps[0] == RepOp && NewOps[1] == RepOp) + NewOps[0] == NewOps[1] && + any_of(Ops, [=](const auto &Rep) { return NewOps[0] == Rep.second; })) return Constant::getNullValue(I->getType()); // If we are substituting an absorber constant into a binop and extra @@ -4382,10 +4388,10 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // (Op == 0) ? 0 : (Op & -Op) --> Op & -Op // (Op == 0) ? 0 : (Op * (binop Op, C)) --> Op * (binop Op, C) // (Op == -1) ? 
-1 : (Op | (binop C, Op) --> Op | (binop C, Op) - Constant *Absorber = - ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); + Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); if ((NewOps[0] == Absorber || NewOps[1] == Absorber) && - impliesPoison(BO, Op)) + any_of(Ops, + [=](const auto &Rep) { return impliesPoison(BO, Rep.first); })) return Absorber; } @@ -4453,6 +4459,15 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, /*AllowNonDeterministic=*/false); } +static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, + bool AllowRefinement, + SmallVectorImpl *DropFlags, + unsigned MaxRecurse) { + return simplifyWithOpsReplaced(V, {{Op, RepOp}}, Q, AllowRefinement, + DropFlags, MaxRecurse); +} + Value *llvm::simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, bool AllowRefinement, @@ -4595,21 +4610,20 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, /// Try to simplify a select instruction when its condition operand is an /// integer equality or floating-point equivalence comparison. 
-static Value *simplifySelectWithEquivalence(Value *CmpLHS, Value *CmpRHS, - Value *TrueVal, Value *FalseVal, - const SimplifyQuery &Q, - unsigned MaxRecurse) { +static Value *simplifySelectWithEquivalence( + ArrayRef> Replacements, Value *TrueVal, + Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { Value *SimplifiedFalseVal = - simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q.getWithoutUndef(), - /* AllowRefinement */ false, - /* DropFlags */ nullptr, MaxRecurse); + simplifyWithOpsReplaced(FalseVal, Replacements, Q.getWithoutUndef(), + /* AllowRefinement */ false, + /* DropFlags */ nullptr, MaxRecurse); if (!SimplifiedFalseVal) SimplifiedFalseVal = FalseVal; Value *SimplifiedTrueVal = - simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, - /* AllowRefinement */ true, - /* DropFlags */ nullptr, MaxRecurse); + simplifyWithOpsReplaced(TrueVal, Replacements, Q, + /* AllowRefinement */ true, + /* DropFlags */ nullptr, MaxRecurse); if (!SimplifiedTrueVal) SimplifiedTrueVal = TrueVal; @@ -4707,10 +4721,10 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, // the arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { - if (Value *V = simplifySelectWithEquivalence(CmpLHS, CmpRHS, TrueVal, + if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; - if (Value *V = simplifySelectWithEquivalence(CmpRHS, CmpLHS, TrueVal, + if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; @@ -4720,11 +4734,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, if (match(CmpLHS, m_Or(m_Value(X), m_Value(Y))) && match(CmpRHS, m_Zero())) { // (X | Y) == 0 implies X == 0 and Y == 0. 
- if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) - return V; - if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence( + {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; } @@ -4732,11 +4743,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, if (match(CmpLHS, m_And(m_Value(X), m_Value(Y))) && match(CmpRHS, m_AllOnes())) { // (X & Y) == -1 implies X == -1 and Y == -1. - if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) - return V; - if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal, - Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence( + {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse)) return V; } } @@ -4765,11 +4773,11 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F, // This transforms is safe if at least one operand is known to not be zero. // Otherwise, the select can change the sign of a zero operand. 
if (IsEquiv) { - if (Value *V = - simplifySelectWithEquivalence(CmpLHS, CmpRHS, T, F, Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, T, F, Q, + MaxRecurse)) return V; - if (Value *V = - simplifySelectWithEquivalence(CmpRHS, CmpLHS, T, F, Q, MaxRecurse)) + if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, T, F, Q, + MaxRecurse)) return V; } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 6f351e138e89d..7dac0deed7b7e 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1526,17 +1526,18 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // In a situation like the following: // - // undef %2.subreg:reg = INST %1:reg ; DefMI (rematerializable), - // ; DefSubIdx = subreg - // %3:reg = COPY %2 ; SrcIdx = DstIdx = 0 - // .... = SOMEINSTR %3:reg + // undef %2.subreg:reg = INST %1:reg ; DefMI (rematerializable), + // ; Defines only some of lanes, + // ; so DefSubIdx = NewIdx = subreg + // %3:reg = COPY %2 ; Copy full reg + // .... = SOMEINSTR %3:reg ; Use full reg // // there are no subranges for %3 so after rematerialization we need // to explicitly create them. Undefined subranges are removed later on. 
- if (DefSubIdx && !CP.getSrcIdx() && !CP.getDstIdx() && - MRI->shouldTrackSubRegLiveness(DstReg) && !DstInt.hasSubRanges()) { + if (NewIdx && !DstInt.hasSubRanges() && + MRI->shouldTrackSubRegLiveness(DstReg)) { LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg); - LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(DefSubIdx); + LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(NewIdx); LaneBitmask UnusedLanes = FullMask & ~UsedLanes; VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); DstInt.createSubRangeFrom(Alloc, UsedLanes, DstInt); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3593e550f1ece..c860188d32f36 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20457,10 +20457,8 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { Value.hasOneUse()) { LoadSDNode *LD = cast(Value); EVT VT = LD->getMemoryVT(); - if (!VT.isFloatingPoint() || - VT != ST->getMemoryVT() || - LD->isNonTemporal() || - ST->isNonTemporal() || + if (!VT.isSimple() || !VT.isFloatingPoint() || VT != ST->getMemoryVT() || + LD->isNonTemporal() || ST->isNonTemporal() || LD->getPointerInfo().getAddrSpace() != 0 || ST->getPointerInfo().getAddrSpace() != 0) return SDValue(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 074350347842a..80cf04dd39386 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -753,6 +753,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::v8bf16, Expand); } + // For bf16, fpextend is custom lowered to be optionally expanded into shifts. 
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom); + auto LegalizeNarrowFP = [this](MVT ScalarVT) { for (auto Op : { ISD::SETCC, @@ -893,10 +901,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::f16, Legal); } - // Strict conversion to a larger type is legal - for (auto VT : {MVT::f32, MVT::f64}) - setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); @@ -4498,6 +4502,54 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPExtendToSVE(Op, DAG); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); + EVT Op0VT = Op0.getValueType(); + if (VT == MVT::f64) { + // FP16->FP32 extends are legal for v32 and v4f32. + if (Op0VT == MVT::f32 || Op0VT == MVT::f16) + return Op; + // Split bf16->f64 extends into two fpextends. + if (Op0VT == MVT::bf16 && IsStrict) { + SDValue Ext1 = + DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other}, + {Op0, Op.getOperand(0)}); + return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other}, + {Ext1, Ext1.getValue(1)}); + } + if (Op0VT == MVT::bf16) + return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0)); + return SDValue(); + } + + if (VT.getScalarType() == MVT::f32) { + // FP16->FP32 extends are legal for v32 and v4f32. 
+ if (Op0VT.getScalarType() == MVT::f16) + return Op; + if (Op0VT.getScalarType() == MVT::bf16) { + SDLoc DL(Op); + EVT IVT = VT.changeTypeToInteger(); + if (!Op0VT.isVector()) { + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0); + IVT = MVT::v4i32; + } + + EVT Op0IVT = Op0.getValueType().changeTypeToInteger(); + SDValue Ext = + DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0)); + SDValue Shift = + DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT)); + if (!Op0VT.isVector()) + Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift, + DAG.getConstant(0, DL, MVT::i64)); + Shift = DAG.getBitcast(VT, Shift); + return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL) + : Shift; + } + return SDValue(); + } + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); return SDValue(); } @@ -7345,6 +7397,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ec891ea4bac85..c6f5cdcd1d5fe 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5123,22 +5123,6 @@ let Predicates = [HasFullFP16] in { //===----------------------------------------------------------------------===// defm FCVT : FPConversion<"fcvt">; -// Helper to get bf16 into fp32. -def cvt_bf16_to_fp32 : - OutPatFrag<(ops node:$Rn), - (f32 (COPY_TO_REGCLASS - (i32 (UBFMWri - (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - node:$Rn, hsub), GPR32)), - (i64 (i32shift_a (i64 16))), - (i64 (i32shift_b (i64 16))))), - FPR32))>; -// Pattern for bf16 -> fp32. -def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))), - (cvt_bf16_to_fp32 FPR16:$Rn)>; -// Pattern for bf16 -> fp64. 
-def : Pat<(f64 (any_fpextend (bf16 FPR16:$Rn))), - (FCVTDSr (f32 (cvt_bf16_to_fp32 FPR16:$Rn)))>; //===----------------------------------------------------------------------===// // Floating point single operand instructions. @@ -8333,8 +8317,6 @@ def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))> def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -// Vector bf16 -> fp32 is implemented morally as a zext + shift. -def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), (SHLLv4i16 V64:$Rn)>; // Also match an extend from the upper half of a 128 bit source register. def : Pat<(v8i16 (anyext (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn)) ))), (USHLLv16i8_shift V128:$Rn, (i32 0))>; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c2199fd587bea..2bc19137b1ca0 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1096,21 +1096,8 @@ void SIFoldOperandsImpl::foldOperand( B.addImm(Defs[I].second); } LLVM_DEBUG(dbgs() << "Folded " << *UseMI); - return; } - if (Size != 4) - return; - - Register Reg0 = UseMI->getOperand(0).getReg(); - Register Reg1 = UseMI->getOperand(1).getReg(); - if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64)); - else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64)); - else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) && - TRI->isAGPR(*MRI, Reg1)) - UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32)); return; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index b09a7c31dba8e..fd3425714837c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ 
b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3731,12 +3731,15 @@ def : IntMinMaxPat; def : IntMinMaxPat; def : FPMinMaxPat; def : FPMinMaxPat; -def : FPMinMaxPat; -def : FPMinMaxPat; def : FPMinCanonMaxPat; def : FPMinCanonMaxPat; -def : FPMinCanonMaxPat; -def : FPMinCanonMaxPat; +} + +let True16Predicate = UseFakeTrue16Insts in { +def : FPMinMaxPat; +def : FPMinMaxPat; +def : FPMinCanonMaxPat; +def : FPMinCanonMaxPat; } let SubtargetPredicate = isGFX9Plus in { @@ -3744,6 +3747,10 @@ let True16Predicate = NotHasTrue16BitInsts in { defm : Int16Med3Pat; defm : Int16Med3Pat; } +let True16Predicate = UseRealTrue16Insts in { + defm : Int16Med3Pat; + defm : Int16Med3Pat; +} let True16Predicate = UseFakeTrue16Insts in { defm : Int16Med3Pat; defm : Int16Med3Pat; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index cef1f20f3420a..24a2eede9ca3f 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1374,8 +1374,8 @@ class VOP3_DOT_Profile_fake16>; defm V_MINMAX_F32 : VOP3Inst<"v_minmax_f32", VOP3_Profile>; - defm V_MAXMIN_F16 : VOP3Inst<"v_maxmin_f16", VOP3_Profile>; - defm V_MINMAX_F16 : VOP3Inst<"v_minmax_f16", VOP3_Profile>; + defm V_MAXMIN_F16 : VOP3Inst_t16<"v_maxmin_f16", VOP_F16_F16_F16_F16>; + defm V_MINMAX_F16 : VOP3Inst_t16<"v_minmax_f16", VOP_F16_F16_F16_F16>; defm V_MAXMIN_U32 : VOP3Inst<"v_maxmin_u32", VOP3_Profile>; defm V_MINMAX_U32 : VOP3Inst<"v_minmax_u32", VOP3_Profile>; defm V_MAXMIN_I32 : VOP3Inst<"v_maxmin_i32", VOP3_Profile>; @@ -1588,8 +1588,8 @@ defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">; defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">; defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">; -defm 
V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">; -defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">; +defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">; +defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">; defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>; defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>; defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>; @@ -1730,8 +1730,8 @@ defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>; defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>; defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>; defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>; -defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>; -defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>; +defm V_MAXMIN_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x260, "v_maxmin_f16">; +defm V_MINMAX_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x261, "v_minmax_f16">; defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12<0x262>; defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12<0x263>; defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11_gfx12<0x264>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 0f568ba90a9ef..4cb42e1737a35 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1908,8 +1908,8 @@ multiclass VOP3_Realtriple_t16_gfx11 op, string asmName, string opName multiclass VOP3_Realtriple_t16_and_fake16_gfx11 op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> { - defm _t16: VOP3_Realtriple_t16_gfx11; - defm _fake16: VOP3_Realtriple_t16_gfx11; + defm _t16: VOP3_Realtriple_t16_gfx11; + defm _fake16: VOP3_Realtriple_t16_gfx11; } multiclass VOP3Only_Realtriple_t16_gfx11 op, string 
asmName, diff --git a/llvm/lib/Target/ARM/ARMSystemRegister.td b/llvm/lib/Target/ARM/ARMSystemRegister.td index c03db15d10411..3afc410e04568 100644 --- a/llvm/lib/Target/ARM/ARMSystemRegister.td +++ b/llvm/lib/Target/ARM/ARMSystemRegister.td @@ -19,17 +19,13 @@ class MClassSysReg UniqMask1, bits<1> UniqMask2, bits<1> UniqMask3, bits<12> Enc12, - string name> : SearchableTable { - let SearchableFields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding"]; + string name> { string Name; bits<13> M1Encoding12; bits<10> M2M3Encoding8; bits<12> Encoding; let Name = name; - let EnumValueField = "M1Encoding12"; - let EnumValueField = "M2M3Encoding8"; - let EnumValueField = "Encoding"; let M1Encoding12{12} = UniqMask1; let M1Encoding12{11-00} = Enc12; @@ -41,6 +37,27 @@ class MClassSysReg UniqMask1, code Requires = [{ {} }]; } +def MClassSysRegsList : GenericTable { + let FilterClass = "MClassSysReg"; + let Fields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding", + "Requires"]; +} + +def lookupMClassSysRegByName : SearchIndex { + let Table = MClassSysRegsList; + let Key = ["Name"]; +} + +def lookupMClassSysRegByM1Encoding12 : SearchIndex { + let Table = MClassSysRegsList; + let Key = ["M1Encoding12"]; +} + +def lookupMClassSysRegByM2M3Encoding8 : SearchIndex { + let Table = MClassSysRegsList; + let Key = ["M2M3Encoding8"]; +} + // [|i|e|x]apsr_nzcvq has alias [|i|e|x]apsr. 
// Mask1 Mask2 Mask3 Enc12, Name let Requires = [{ {ARM::FeatureDSP} }] in { @@ -127,15 +144,29 @@ def : MClassSysReg<0, 0, 1, 0x8a7, "pac_key_u_3_ns">; // Banked Registers // -class BankedReg enc> - : SearchableTable { +class BankedReg enc> { string Name; bits<8> Encoding; let Name = name; let Encoding = enc; - let SearchableFields = ["Name", "Encoding"]; } +def BankedRegsList : GenericTable { + let FilterClass = "BankedReg"; + let Fields = ["Name", "Encoding"]; +} + +def lookupBankedRegByName : SearchIndex { + let Table = BankedRegsList; + let Key = ["Name"]; +} + +def lookupBankedRegByEncoding : SearchIndex { + let Table = BankedRegsList; + let Key = ["Encoding"]; +} + + // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM // and bit 5 is R. def : BankedReg<"r8_usr", 0x00>; diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp index 494c67d4b7768..e76a70b3610a8 100644 --- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp +++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp @@ -62,13 +62,13 @@ const MClassSysReg *lookupMClassSysRegBy8bitSYSmValue(unsigned SYSm) { return ARMSysReg::lookupMClassSysRegByM2M3Encoding8((1<<8)|(SYSm & 0xFF)); } -#define GET_MCLASSSYSREG_IMPL +#define GET_MClassSysRegsList_IMPL #include "ARMGenSystemRegister.inc" } // end namespace ARMSysReg namespace ARMBankedReg { -#define GET_BANKEDREG_IMPL +#define GET_BankedRegsList_IMPL #include "ARMGenSystemRegister.inc" } // end namespce ARMSysReg } // end namespace llvm diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h index 5562572c5abf4..dc4f811e075c6 100644 --- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -206,8 +206,8 @@ namespace ARMSysReg { } }; - #define GET_MCLASSSYSREG_DECL - #include "ARMGenSystemRegister.inc" +#define GET_MClassSysRegsList_DECL +#include "ARMGenSystemRegister.inc" // lookup system register using 12-bit SYSm value. 
// Note: the search is uniqued using M1 mask @@ -228,8 +228,8 @@ namespace ARMBankedReg { const char *Name; uint16_t Encoding; }; - #define GET_BANKEDREG_DECL - #include "ARMGenSystemRegister.inc" +#define GET_BankedRegsList_DECL +#include "ARMGenSystemRegister.inc" } // end namespace ARMBankedReg } // end namespace llvm diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 4e01dd1145a55..e0ee4d6d6b130 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -415,8 +415,16 @@ class OpLowerer { } } - OldResult = cast( - IRB.CreateExtractValue(Op, 0, OldResult->getName())); + if (OldResult->use_empty()) { + // Only the check bit was used, so we're done here. + OldResult->eraseFromParent(); + return Error::success(); + } + + assert(OldResult->hasOneUse() && + isa(*OldResult->user_begin()) && + "Expected only use to be extract of first element"); + OldResult = cast(*OldResult->user_begin()); OldTy = ST->getElementType(0); } @@ -723,9 +731,6 @@ class OpLowerer { HasErrors |= lowerGetPointer(F); break; case Intrinsic::dx_resource_load_typedbuffer: - HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/false); - break; - case Intrinsic::dx_resource_loadchecked_typedbuffer: HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true); break; case Intrinsic::dx_resource_store_typedbuffer: diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 1ff8f09f066db..837624935c5fa 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -30,6 +30,9 @@ static void replaceTypedBufferAccess(IntrinsicInst *II, "Unexpected typed buffer type"); Type *ContainedType = HandleType->getTypeParameter(0); + Type *LoadType = + StructType::get(ContainedType, Type::getInt1Ty(II->getContext())); + // We need the size of an element in bytes so that we can calculate the offset // in 
elements given a total offset in bytes later. Type *ScalarType = ContainedType->getScalarType(); @@ -81,13 +84,15 @@ static void replaceTypedBufferAccess(IntrinsicInst *II, // We're storing a scalar, so we need to load the current value and only // replace the relevant part. auto *Load = Builder.CreateIntrinsic( - ContainedType, Intrinsic::dx_resource_load_typedbuffer, + LoadType, Intrinsic::dx_resource_load_typedbuffer, {II->getOperand(0), II->getOperand(1)}); + auto *Struct = Builder.CreateExtractValue(Load, {0}); + // If we have an offset from seeing a GEP earlier, use it. Value *IndexOp = Current.Index ? Current.Index : ConstantInt::get(Builder.getInt32Ty(), 0); - V = Builder.CreateInsertElement(Load, V, IndexOp); + V = Builder.CreateInsertElement(Struct, V, IndexOp); } else { llvm_unreachable("Store to typed resource has invalid type"); } @@ -101,8 +106,10 @@ static void replaceTypedBufferAccess(IntrinsicInst *II, } else if (auto *LI = dyn_cast(Current.Access)) { IRBuilder<> Builder(LI); Value *V = Builder.CreateIntrinsic( - ContainedType, Intrinsic::dx_resource_load_typedbuffer, + LoadType, Intrinsic::dx_resource_load_typedbuffer, {II->getOperand(0), II->getOperand(1)}); + V = Builder.CreateExtractValue(V, {0}); + if (Current.Index) V = Builder.CreateExtractElement(V, Current.Index); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 65e1893d3f3bd..d34f45fcac008 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -14,7 +14,7 @@ #include "NVPTX.h" #include "NVPTXUtilities.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/NVVMIntrinsicFlags.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 
c51729e224bf5..5b4ac50c8fd7b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -17,7 +17,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsNVPTX.h" -#include "llvm/IR/NVVMIntrinsicFlags.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 44661647a8631..98d3615ebab58 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM RISCVGenRegisterBank.inc -gen-register-bank) tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info) tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM RISCVGenExegesis.inc -gen-exegesis) set(LLVM_TARGET_DEFINITIONS RISCVGISel.td) tablegen(LLVM RISCVGenGlobalISel.inc -gen-global-isel) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index ef85057ba1264..3f1539da4a9c8 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -80,7 +80,6 @@ class RISCVInstructionSelector : public InstructionSelector { bool selectFPCompare(MachineInstr &MI, MachineIRBuilder &MIB) const; void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; - bool selectMergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; ComplexRendererFns selectShiftMask(MachineOperand &Root, @@ -732,8 +731,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { } case TargetOpcode::G_IMPLICIT_DEF: return selectImplicitDef(MI, MIB); - case 
TargetOpcode::G_MERGE_VALUES: - return selectMergeValues(MI, MIB); case TargetOpcode::G_UNMERGE_VALUES: return selectUnmergeValues(MI, MIB); default: @@ -741,26 +738,13 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { } } -bool RISCVInstructionSelector::selectMergeValues(MachineInstr &MI, - MachineIRBuilder &MIB) const { - assert(MI.getOpcode() == TargetOpcode::G_MERGE_VALUES); - - // Build a F64 Pair from operands - if (MI.getNumOperands() != 3) - return false; - Register Dst = MI.getOperand(0).getReg(); - Register Lo = MI.getOperand(1).getReg(); - Register Hi = MI.getOperand(2).getReg(); - if (!isRegInFprb(Dst) || !isRegInGprb(Lo) || !isRegInGprb(Hi)) - return false; - MI.setDesc(TII.get(RISCV::BuildPairF64Pseudo)); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); -} - bool RISCVInstructionSelector::selectUnmergeValues( MachineInstr &MI, MachineIRBuilder &MIB) const { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); + if (!Subtarget->hasStdExtZfa()) + return false; + // Split F64 Src into two s32 parts if (MI.getNumOperands() != 3) return false; @@ -769,8 +753,17 @@ bool RISCVInstructionSelector::selectUnmergeValues( Register Hi = MI.getOperand(1).getReg(); if (!isRegInFprb(Src) || !isRegInGprb(Lo) || !isRegInGprb(Hi)) return false; - MI.setDesc(TII.get(RISCV::SplitF64Pseudo)); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + + MachineInstr *ExtractLo = MIB.buildInstr(RISCV::FMV_X_W_FPR64, {Lo}, {Src}); + if (!constrainSelectedInstRegOperands(*ExtractLo, TII, TRI, RBI)) + return false; + + MachineInstr *ExtractHi = MIB.buildInstr(RISCV::FMVH_X_D, {Hi}, {Src}); + if (!constrainSelectedInstRegOperands(*ExtractHi, TII, TRI, RBI)) + return false; + + MI.eraseFromParent(); + return true; } bool RISCVInstructionSelector::replacePtrWithInt(MachineOperand &Op, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index eab4a5e77d96e..0cb1ef0a66b60 
100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -38,9 +38,12 @@ std::optional RISCVAsmBackend::getFixupKind(StringRef Name) const { if (STI.getTargetTriple().isOSBinFormatELF()) { unsigned Type; Type = llvm::StringSwitch(Name) -#define ELF_RELOC(X, Y) .Case(#X, Y) +#define ELF_RELOC(NAME, ID) .Case(#NAME, ID) #include "llvm/BinaryFormat/ELFRelocs/RISCV.def" #undef ELF_RELOC +#define ELF_RISCV_NONSTANDARD_RELOC(_VENDOR, NAME, ID) .Case(#NAME, ID) +#include "llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def" +#undef ELF_RISCV_NONSTANDARD_RELOC .Case("BFD_RELOC_NONE", ELF::R_RISCV_NONE) .Case("BFD_RELOC_32", ELF::R_RISCV_32) .Case("BFD_RELOC_64", ELF::R_RISCV_64) diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 963124140cd03..4e0c64a5ca2c6 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -63,6 +63,12 @@ include "RISCVSchedXiangShanNanHu.td" include "RISCVProcessors.td" +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "RISCVPfmCounters.td" + //===----------------------------------------------------------------------===// // Define the RISC-V target. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index ae969bff82fd1..349bc361c90fe 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -23,7 +23,9 @@ def SDT_RISCVSplitF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisVT<2, f64>]>; def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>; +def : GINodeEquiv; def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>; +def : GINodeEquiv; def AddrRegImmINX : ComplexPattern; diff --git a/llvm/lib/Target/RISCV/RISCVPfmCounters.td b/llvm/lib/Target/RISCV/RISCVPfmCounters.td new file mode 100644 index 0000000000000..013e789a9e921 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPfmCounters.td @@ -0,0 +1,18 @@ +//===---- RISCVPfmCounters.td - RISC-V Hardware Counters ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This describes the available hardware counters for RISC-V. 
+// +//===----------------------------------------------------------------------===// + +def CpuCyclesPfmCounter : PfmCounter<"CYCLES">; + +def DefaultPfmCounters : ProcPfmCounters { + let CycleCounter = CpuCyclesPfmCounter; +} +def : PfmCountersDefaultBinding; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 68bdeb1cebeb9..a6bfb489faff2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41694,6 +41694,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MVT VT = N.getSimpleValueType(); + unsigned NumElts = VT.getVectorNumElements(); + SmallVector Mask; unsigned Opcode = N.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -41979,7 +41981,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, APInt Mask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(In, Mask)) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In); - MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); + MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc); SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec); return DAG.getBitcast(VT, Movl); @@ -41994,7 +41996,6 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, // Create a vector constant - scalar constant followed by zeros. 
EVT ScalarVT = N0.getOperand(0).getValueType(); Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext()); - unsigned NumElts = VT.getVectorNumElements(); Constant *Zero = ConstantInt::getNullValue(ScalarTy); SmallVector ConstantVec(NumElts, Zero); ConstantVec[0] = const_cast(C->getConstantIntValue()); @@ -42045,9 +42046,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, MVT SrcVT = N0.getOperand(0).getSimpleValueType(); unsigned SrcBits = SrcVT.getScalarSizeInBits(); if ((EltBits % SrcBits) == 0 && SrcBits >= 32) { - unsigned Size = VT.getVectorNumElements(); unsigned NewSize = SrcVT.getVectorNumElements(); - APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size); + APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(NumElts); APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), @@ -42460,7 +42460,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; - MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2); V = DAG.getBitcast(DVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 9c14c82749f3c..9716152d47de3 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4809,9 +4809,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, MVT MScalarTy = LT.second.getScalarType(); auto IsCheapPInsrPExtrInsertPS = [&]() { // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. + // Inserting f32 into index0 is just movss. 
// Also, assume insertps is relatively cheap on all >= SSE41 targets. return (MScalarTy == MVT::i16 && ST->hasSSE2()) || (MScalarTy.isInteger() && ST->hasSSE41()) || + (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 && + Opcode == Instruction::InsertElement) || (MScalarTy == MVT::f32 && ST->hasSSE41() && Opcode == Instruction::InsertElement); }; diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 240d089ebeff8..7b59c39283ded 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -69,7 +69,6 @@ static const char *const CoroIntrinsics[] = { "llvm.coro.async.context.dealloc", "llvm.coro.async.resume", "llvm.coro.async.size.replace", - "llvm.coro.async.store_resume", "llvm.coro.await.suspend.bool", "llvm.coro.await.suspend.handle", "llvm.coro.await.suspend.void", diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index e9bb2b8847563..184c75a1dd860 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -4964,8 +4964,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { // (A & B) ^ (A | C) --> A ? ~B : C -- There are 4 commuted variants. 
if (I.getType()->isIntOrIntVectorTy(1) && - match(Op0, m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B)))) && - match(Op1, m_OneUse(m_LogicalOr(m_Value(C), m_Value(D))))) { + match(&I, m_c_Xor(m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B))), + m_OneUse(m_LogicalOr(m_Value(C), m_Value(D)))))) { bool NeedFreeze = isa(Op0) && isa(Op1) && B == D; if (B == C || B == D) std::swap(A, B); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e0f629e14f657..47866dac9ad91 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8354,17 +8354,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; - if (Reverse) + if (Reverse) { + // When folding the tail, we may compute an address that we don't in the + // original scalar loop and it may not be inbounds. Drop Inbounds in that + // case. + GEPNoWrapFlags Flags = + (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) + ? GEPNoWrapFlags::none() + : GEPNoWrapFlags::inBounds(); VectorPtr = new VPReverseVectorPointerRecipe( - Ptr, &Plan.getVF(), getLoadStoreType(I), - GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds() - : GEPNoWrapFlags::none(), - I->getDebugLoc()); - else + Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); + } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? 
GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); + } Builder.getInsertBlock()->appendRecipe(VectorPtr); Ptr = VectorPtr; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ea53a1acebd1d..120eafae8c5ac 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -705,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { InstructionCost NewCost = TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask, + CostKind); bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; // If the lengths of the two vectors are not equal, diff --git a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll index 252497643a4f3..ee82e10f9ebb6 100644 --- a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll +++ b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll @@ -76,58 +76,58 @@ define i32 @insert_double(i32 %arg, double %val, <2 x double> %src128, <4 x doub define i32 @insert_float(i32 %arg, float %val, <2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { ; SSE2-LABEL: 'insert_float' ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3 ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3 -; 
SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE3-LABEL: 'insert_float' ; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1 ; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3 ; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> 
%src256, float %val, i32 0 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7 ; SSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3 -; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8 ; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15 ; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'insert_float' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0 
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll index 1aa28f5c2733d..9a1203f18243d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll @@ -156,11 +156,10 @@ define i32 @fptosi_bf(bfloat %a) nounwind ssp { ; CHECK-LABEL: fptosi_bf: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov s1, s0 -; CHECK-NEXT: // implicit-def: $s0 +; CHECK-NEXT: // implicit-def: $d0 ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret entry: @@ -173,11 +172,10 @@ define i32 @fptoui_sbf(bfloat %a) nounwind ssp { ; CHECK-LABEL: fptoui_sbf: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov s1, s0 -; CHECK-NEXT: // implicit-def: $s0 +; CHECK-NEXT: // implicit-def: $d0 ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: // kill: def $s0 
killed $s0 killed $q0 ; CHECK-NEXT: fcvtzu w0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index ed9c1b037d0cc..fb40dfcbe101d 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -182,17 +182,14 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: 
lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fadd s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: @@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; 
NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fadd s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 888b795876f7d..818dcf3a0b487 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -184,17 +184,14 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, 
#32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fmaxnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: 
test_atomicrmw_fmax_bf16_seq_cst_align2: @@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fmaxnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; 
LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: @@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: dup v1.4h, v0.h[1] ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s0, w9 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 +; NOLSE-NEXT: shll v1.4s, v1.4h, #16 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: mov h3, v2.h[1] -; NOLSE-NEXT: fmov w11, s2 -; NOLSE-NEXT: lsl w11, w11, #16 -; NOLSE-NEXT: fmov w10, s3 -; NOLSE-NEXT: fmov s3, w11 -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: dup v3.4h, v2.h[1] +; NOLSE-NEXT: shll v2.4s, v2.4h, #16 ; NOLSE-NEXT: fmaxnm s2, s2, s0 -; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fmov w11, s2 ; NOLSE-NEXT: ubfx w13, w11, #16, #1 ; NOLSE-NEXT: add w11, w11, w8 -; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: add w11, w13, w11 ; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 @@ -697,25 +679,17 @@ define <2 x 
bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: ; LSE: // %bb.0: ; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; LSE-NEXT: mov h1, v0.h[1] -; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: dup v1.4h, v0.h[1] +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr s0, [x0] -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s1 -; LSE-NEXT: fmov s2, w10 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: shll v1.4s, v1.4h, #16 ; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov h3, v0.h[1] -; LSE-NEXT: fmov w10, s0 -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s3 -; LSE-NEXT: fmov s4, w10 -; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: dup v3.4h, v0.h[1] +; LSE-NEXT: shll v4.4s, v0.4h, #16 ; LSE-NEXT: fmaxnm s4, s4, s2 -; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: shll v3.4s, v3.4h, #16 ; LSE-NEXT: fmaxnm s3, s3, s1 ; LSE-NEXT: fmov w10, s4 ; LSE-NEXT: ubfx w12, w10, #16, #1 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index a3665c6e42860..b969241e8bf90 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -184,17 +184,14 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, 
w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fminnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: @@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: 
test_atomicrmw_fmin_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fminnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, 
w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: @@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: dup v1.4h, v0.h[1] ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s0, w9 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 +; NOLSE-NEXT: shll v1.4s, v1.4h, #16 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: mov h3, v2.h[1] -; NOLSE-NEXT: fmov w11, s2 -; NOLSE-NEXT: lsl w11, w11, #16 -; NOLSE-NEXT: fmov w10, s3 -; NOLSE-NEXT: fmov s3, w11 -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: dup v3.4h, v2.h[1] +; NOLSE-NEXT: shll v2.4s, v2.4h, #16 ; NOLSE-NEXT: fminnm s2, s2, s0 -; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fmov w11, s2 ; NOLSE-NEXT: ubfx w13, w11, #16, #1 ; NOLSE-NEXT: add w11, w11, w8 -; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: add w11, w13, w11 ; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 @@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: ; LSE: // %bb.0: ; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; LSE-NEXT: mov h1, v0.h[1] -; LSE-NEXT: fmov w10, 
s0 +; LSE-NEXT: dup v1.4h, v0.h[1] +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr s0, [x0] -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s1 -; LSE-NEXT: fmov s2, w10 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: shll v1.4s, v1.4h, #16 ; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov h3, v0.h[1] -; LSE-NEXT: fmov w10, s0 -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s3 -; LSE-NEXT: fmov s4, w10 -; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: dup v3.4h, v0.h[1] +; LSE-NEXT: shll v4.4s, v0.4h, #16 ; LSE-NEXT: fminnm s4, s4, s2 -; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: shll v3.4s, v3.4h, #16 ; LSE-NEXT: fminnm s3, s3, s1 ; LSE-NEXT: fmov w10, s4 ; LSE-NEXT: ubfx w12, w10, #16, #1 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 7725ce0e73185..e603337e7a569 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -182,17 +182,14 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -202,36 +199,34 @@ define bfloat 
@test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fsub s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: @@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll 
v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fsub s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; 
LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index 33997614598c3..bc06453e9c01f 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -5,16 +5,12 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fadd: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fadd s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -26,15 +22,11 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fadd: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fadd s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fadd bfloat %a, %b @@ -44,16 +36,12 @@ define 
bfloat @test_fadd(bfloat %a, bfloat %b) #0 { define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fsub: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fsub s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fsub s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -65,15 +53,11 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fsub: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fsub s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fsub s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fsub bfloat %a, %b @@ -83,16 +67,12 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fmul: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: 
fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fmul s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmul s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -104,15 +84,11 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fmul: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmul s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fmul bfloat %a, %b @@ -122,27 +98,21 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-LABEL: test_fmadd: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; 
CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmul s0, s1, s0 +; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmul s0, s0, s1 +; CHECK-CVT-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 ; CHECK-CVT-NEXT: add w8, w8, w10 ; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: fmov w9, s2 ; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 ; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 @@ -155,23 +125,15 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { ; ; CHECK-BF16-LABEL: test_fmadd: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmov w9, s2 -; CHECK-BF16-NEXT: fmul s0, s1, s0 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 +; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: 
bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -183,16 +145,12 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fdiv: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fdiv s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fdiv s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -204,15 +162,11 @@ define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fdiv: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fdiv s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fdiv s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fdiv bfloat %a, %b @@ -223,14 +177,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_frem: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-CVT-NEXT: bl fmodf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -246,14 +198,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_frem: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-BF16-NEXT: bl fmodf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -334,17 +284,13 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 { define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 { ; CHECK-LABEL: test_select_cc: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3 -; CHECK-NEXT: // kill: def $h2 killed $h2 def 
$s2 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-NEXT: // kill: def $h2 killed $h2 def $d2 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 ; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: fcmp s2, s3 ; CHECK-NEXT: fcsel s0, s0, s1, ne ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: ret @@ -356,15 +302,11 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 { define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 { ; CHECK-LABEL: test_select_cc_f32_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3 -; CHECK-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: fcmp s2, s3 ; CHECK-NEXT: fcsel s0, s0, s1, ne ; CHECK-NEXT: ret %cc = fcmp une bfloat %c, %d @@ -389,15 +331,11 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d) define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_une: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; 
CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %r = fcmp une bfloat %a, %b @@ -407,15 +345,11 @@ define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ueq: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w8, eq ; CHECK-NEXT: csinc w0, w8, wzr, vc ; CHECK-NEXT: ret @@ -426,15 +360,11 @@ define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ugt: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret %r = fcmp ugt bfloat %a, %b @@ -444,15 +374,11 @@ define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_uge: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed 
$h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, pl ; CHECK-NEXT: ret %r = fcmp uge bfloat %a, %b @@ -462,15 +388,11 @@ define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ult: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, lt ; CHECK-NEXT: ret %r = fcmp ult bfloat %a, %b @@ -480,15 +402,11 @@ define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ule: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, le ; CHECK-NEXT: ret %r = fcmp ule bfloat %a, %b @@ -498,15 
+416,11 @@ define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_uno: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, vs ; CHECK-NEXT: ret %r = fcmp uno bfloat %a, %b @@ -516,15 +430,11 @@ define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_one: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w8, mi ; CHECK-NEXT: csinc w0, w8, wzr, le ; CHECK-NEXT: ret @@ -535,15 +445,11 @@ define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_oeq: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: 
def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %r = fcmp oeq bfloat %a, %b @@ -553,15 +459,11 @@ define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ogt: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret %r = fcmp ogt bfloat %a, %b @@ -571,15 +473,11 @@ define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_oge: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, ge ; CHECK-NEXT: ret %r = fcmp oge bfloat %a, %b @@ -589,15 +487,11 @@ define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_olt: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // 
kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret %r = fcmp olt bfloat %a, %b @@ -607,15 +501,11 @@ define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ole: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, ls ; CHECK-NEXT: ret %r = fcmp ole bfloat %a, %b @@ -625,15 +515,11 @@ define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ord: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, vc ; CHECK-NEXT: ret %r = fcmp ord 
bfloat %a, %b @@ -643,13 +529,11 @@ define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 { define void @test_fccmp(bfloat %in, ptr %out) { ; CHECK-LABEL: test_fccmp: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: movi v1.2s, #69, lsl #24 -; CHECK-NEXT: movi v3.2s, #72, lsl #24 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v2.4s, v0.4h, #16 ; CHECK-NEXT: adrp x8, .LCPI29_0 +; CHECK-NEXT: movi v3.2s, #72, lsl #24 ; CHECK-NEXT: fcmp s2, s1 ; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] ; CHECK-NEXT: fccmp s2, s3, #4, mi @@ -667,15 +551,11 @@ define void @test_fccmp(bfloat %in, ptr %out) { define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 { ; CHECK-LABEL: test_br_cc: ; CHECK: // %bb.0: // %common.ret -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: csel x8, x0, x1, pl ; CHECK-NEXT: str wzr, [x8] ; CHECK-NEXT: ret @@ -725,10 +605,8 @@ declare i1 @test_dummy(ptr %p1) #0 define i32 @test_fptosi_i32(bfloat %a) #0 { ; CHECK-LABEL: test_fptosi_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret %r = fptosi bfloat %a to i32 @@ -738,10 +616,8 @@ define i32 @test_fptosi_i32(bfloat %a) #0 { define i64 @test_fptosi_i64(bfloat 
%a) #0 { ; CHECK-LABEL: test_fptosi_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs x0, s0 ; CHECK-NEXT: ret %r = fptosi bfloat %a to i64 @@ -751,10 +627,8 @@ define i64 @test_fptosi_i64(bfloat %a) #0 { define i32 @test_fptoui_i32(bfloat %a) #0 { ; CHECK-LABEL: test_fptoui_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu w0, s0 ; CHECK-NEXT: ret %r = fptoui bfloat %a to i32 @@ -764,10 +638,8 @@ define i32 @test_fptoui_i32(bfloat %a) #0 { define i64 @test_fptoui_i64(bfloat %a) #0 { ; CHECK-LABEL: test_fptoui_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu x0, s0 ; CHECK-NEXT: ret %r = fptoui bfloat %a to i64 @@ -927,7 +799,8 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: ucvtf d1, w0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fcvtxn s1, d1 ; CHECK-CVT-NEXT: fmov w9, s1 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -935,12 +808,7 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: add w9, w10, w9 ; CHECK-CVT-NEXT: lsr w9, w9, #16 ; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; 
CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s1, w10 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -954,15 +822,11 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_uitofp_i32_fadd: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ucvtf d1, w0 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fcvtxn s1, d1 -; CHECK-BF16-NEXT: fmov s0, w8 ; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -976,7 +840,8 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: scvtf d1, w0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fcvtxn s1, d1 ; CHECK-CVT-NEXT: fmov w9, s1 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -984,12 +849,7 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: add w9, w10, w9 ; CHECK-CVT-NEXT: lsr w9, w9, #16 ; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s1, w10 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -1003,15 +863,11 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, 
bfloat %b) #0 { ; CHECK-BF16-LABEL: test_sitofp_i32_fadd: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: scvtf d1, w0 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fcvtxn s1, d1 -; CHECK-BF16-NEXT: fmov s0, w8 ; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -1070,10 +926,9 @@ define bfloat @test_fptrunc_double(double %a) #0 { define float @test_fpext_float(bfloat %a) #0 { ; CHECK-LABEL: test_fpext_float: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret %r = fpext bfloat %a to float ret float %r @@ -1082,10 +937,8 @@ define float @test_fpext_float(bfloat %a) #0 { define double @test_fpext_double(bfloat %a) #0 { ; CHECK-LABEL: test_fpext_double: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvt d0, s0 ; CHECK-NEXT: ret %r = fpext bfloat %a to double @@ -1148,11 +1001,9 @@ declare bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) #0 define bfloat @test_sqrt(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_sqrt: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov 
w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fsqrt s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -1165,10 +1016,8 @@ define bfloat @test_sqrt(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_sqrt: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fsqrt s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -1180,10 +1029,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 { ; CHECK-CVT-LABEL: test_powi: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl __powisf2 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1199,10 +1047,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 { ; CHECK-BF16-LABEL: test_powi: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl __powisf2 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1216,10 +1063,9 @@ define bfloat @test_sin(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_sin: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl sinf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1235,10 +1081,9 @@ define bfloat @test_sin(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_sin: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl sinf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1251,10 +1096,9 @@ define bfloat @test_cos(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_cos: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl cosf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1270,10 +1114,9 @@ define bfloat @test_cos(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_cos: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl cosf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1286,10 +1129,9 @@ define bfloat @test_tan(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_tan: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl tanf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1305,10 +1147,9 @@ define bfloat @test_tan(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_tan: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl tanf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1321,10 +1162,9 @@ define bfloat @test_acos(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_acos: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl acosf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1340,10 +1180,9 @@ define bfloat @test_acos(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_acos: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl acosf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1356,10 +1195,9 @@ define bfloat @test_asin(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_asin: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl asinf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1375,10 +1213,9 @@ define bfloat @test_asin(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_asin: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl asinf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1391,10 +1228,9 @@ define bfloat @test_atan(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_atan: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl atanf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1410,10 +1246,9 @@ define bfloat @test_atan(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_atan: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl atanf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1426,14 +1261,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_atan2: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-CVT-NEXT: bl atan2f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1449,14 +1282,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_atan2: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-BF16-NEXT: bl atan2f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1469,10 +1300,9 @@ define bfloat @test_cosh(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_cosh: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl coshf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1488,10 +1318,9 @@ define bfloat @test_cosh(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_cosh: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl coshf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1504,10 +1333,9 @@ define bfloat @test_sinh(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_sinh: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl sinhf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1523,10 +1351,9 @@ define bfloat @test_sinh(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_sinh: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl sinhf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1539,10 +1366,9 @@ define bfloat @test_tanh(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_tanh: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl tanhf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1558,10 +1384,9 @@ define bfloat @test_tanh(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_tanh: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl tanhf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1574,14 +1399,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_pow: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-CVT-NEXT: bl powf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1597,14 +1420,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_pow: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-BF16-NEXT: bl powf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1617,10 +1438,9 @@ define bfloat @test_exp(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_exp: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl expf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1636,10 +1456,9 @@ define bfloat @test_exp(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_exp: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl expf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1652,10 +1471,9 @@ define bfloat @test_exp2(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_exp2: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl exp2f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1671,10 +1489,9 @@ define bfloat @test_exp2(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_exp2: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl exp2f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1687,10 +1504,9 @@ define bfloat @test_log(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_log: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl logf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1706,10 +1522,9 @@ define bfloat @test_log(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_log: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl logf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1722,10 +1537,9 @@ define bfloat @test_log10(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_log10: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl log10f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1741,10 +1555,9 @@ define bfloat @test_log10(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_log10: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl log10f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1757,10 +1570,9 @@ define bfloat @test_log2(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_log2: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl log2f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1776,10 +1588,9 @@ define bfloat @test_log2(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_log2: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl log2f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1791,20 +1602,14 @@ define bfloat @test_log2(bfloat %a) #0 { define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-LABEL: test_fma: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s2 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmov s2, w10 +; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: fmadd s0, s2, s1, s0 +; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmadd s0, s0, s1, s2 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 ; CHECK-CVT-NEXT: add w8, w8, w10 @@ -1816,19 +1621,13 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; ; CHECK-BF16-LABEL: test_fma: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s2 -; CHECK-BF16-NEXT: fmov w9, s1 -; 
CHECK-BF16-NEXT: fmov w10, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: lsl w10, w10, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmov s2, w10 -; CHECK-BF16-NEXT: fmadd s0, s2, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c) @@ -1851,16 +1650,12 @@ define bfloat @test_fabs(bfloat %a) #0 { define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_minnum: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fminnm s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fminnm s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -1872,15 +1667,11 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_minnum: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; 
CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fminnm s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fminnm s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b) @@ -1890,16 +1681,12 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_maxnum: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fmaxnm s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmaxnm s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -1911,15 +1698,11 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_maxnum: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmaxnm s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; 
CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmaxnm s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b) @@ -1929,16 +1712,12 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_copysign: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 ; CHECK-CVT-NEXT: fmov s0, w8 @@ -1947,16 +1726,12 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_copysign: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r 
= call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) @@ -1966,12 +1741,10 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { ; CHECK-CVT-LABEL: test_copysign_f32: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 @@ -1981,12 +1754,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { ; ; CHECK-BF16-LABEL: test_copysign_f32: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -1998,12 +1769,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { define bfloat @test_copysign_f64(bfloat %a, double %b) #0 { ; CHECK-CVT-LABEL: test_copysign_f64: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 @@ -2013,12 +1782,10 @@ define bfloat 
@test_copysign_f64(bfloat %a, double %b) #0 { ; ; CHECK-BF16-LABEL: test_copysign_f64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-BF16-NEXT: fcvt s1, d1 ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2032,34 +1799,33 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 { define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_copysign_extended: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: movi v2.4s, #16 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: lsl w8, w8, #16 ; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: ret ; ; CHECK-BF16-LABEL: test_copysign_extended: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 +; CHECK-BF16-NEXT: // kill: def 
$h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: movi v2.4s, #16 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) @@ -2070,11 +1836,9 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { define bfloat @test_floor(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_floor: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintm s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2087,10 +1851,8 @@ define bfloat @test_floor(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_floor: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintm s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2101,11 +1863,9 @@ define bfloat @test_floor(bfloat %a) #0 { define bfloat @test_ceil(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_ceil: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; 
CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintp s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2118,10 +1878,8 @@ define bfloat @test_ceil(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_ceil: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintp s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2132,11 +1890,9 @@ define bfloat @test_ceil(bfloat %a) #0 { define bfloat @test_trunc(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_trunc: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintz s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2149,10 +1905,8 @@ define bfloat @test_trunc(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_trunc: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintz s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2163,11 +1917,9 @@ define bfloat @test_trunc(bfloat %a) #0 { define bfloat @test_rint(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_rint: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: 
// kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintx s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2180,10 +1932,8 @@ define bfloat @test_rint(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_rint: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintx s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2194,11 +1944,9 @@ define bfloat @test_rint(bfloat %a) #0 { define bfloat @test_nearbyint(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_nearbyint: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frinti s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2211,10 +1959,8 @@ define bfloat @test_nearbyint(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_nearbyint: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frinti s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2225,11 +1971,9 @@ define bfloat @test_nearbyint(bfloat %a) #0 { define bfloat @test_round(bfloat %a) #0 { ; 
CHECK-CVT-LABEL: test_round: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frinta s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2242,10 +1986,8 @@ define bfloat @test_round(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_round: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frinta s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2256,11 +1998,9 @@ define bfloat @test_round(bfloat %a) #0 { define bfloat @test_roundeven(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_roundeven: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintn s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2273,10 +2013,8 @@ define bfloat @test_roundeven(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_roundeven: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintn s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2287,27 +2025,21 @@ define bfloat 
@test_roundeven(bfloat %a) #0 { define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-LABEL: test_fmuladd: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmul s0, s1, s0 +; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmul s0, s0, s1 +; CHECK-CVT-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 ; CHECK-CVT-NEXT: add w8, w8, w10 ; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: fmov w9, s2 ; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 ; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 @@ -2320,23 +2052,15 @@ define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 { ; ; CHECK-BF16-LABEL: test_fmuladd: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmov w9, s2 -; CHECK-BF16-NEXT: fmul s0, s1, s0 -; 
CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 +; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index c03e2e5321321..a609e33be935e 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -272,9 +272,8 @@ define <8 x bfloat> @d_to_h(<8 x double> %a) { define <8 x float> @h_to_s(<8 x bfloat> %a) { ; CHECK-LABEL: h_to_s: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: ret %1 = fpext <8 x bfloat> %a to <8 x float> ret <8 x float> %1 @@ -283,13 +282,12 @@ define <8 x float> @h_to_s(<8 x bfloat> %a) { define <8 x double> @h_to_d(<8 x bfloat> %a) { ; CHECK-LABEL: h_to_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-NEXT: fcvtl v0.2d, v2.2s -; CHECK-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-NEXT: fcvtl2 v1.2d, v2.4s -; CHECK-NEXT: fcvtl2 v3.2d, v4.4s -; CHECK-NEXT: fcvtl v2.2d, v4.2s +; CHECK-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-NEXT: fcvtl v0.2d, v1.2s +; CHECK-NEXT: fcvtl2 v3.2d, v2.4s +; CHECK-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-NEXT: fcvtl v2.2d, v2.2s ; CHECK-NEXT: ret %1 = fpext <8 x bfloat> 
%a to <8 x double> ret <8 x double> %1 @@ -788,11 +786,10 @@ define void @test_insert_at_zero(bfloat %a, ptr %b) #0 { define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptosi_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-NEXT: fcvtzs v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret @@ -803,11 +800,10 @@ define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 { define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptosi_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-NEXT: fcvtzs v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %1 = fptosi<8 x bfloat> %a to <8 x i16> @@ -817,11 +813,10 @@ define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 { define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptoui_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret @@ -832,11 +827,10 @@ define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 { define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptoui_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzu 
v1.4s, v1.4s +; CHECK-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %1 = fptoui<8 x bfloat> %a to <8 x i16> @@ -846,90 +840,58 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 { define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_une: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, ne -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 
+; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, ne +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, ne -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: 
fmov s1, w9 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -941,96 +903,64 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ueq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: lsl w9, w11, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s7, w9 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: csetm w10, eq -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csinv w10, w10, wzr, vc -; CHECK-NEXT: fcmp s7, s6 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w11, s4 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h4, v0.h[4] -; CHECK-NEXT: mov h7, v1.h[5] +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: csinv w8, w8, wzr, vc +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] ; 
CHECK-NEXT: csetm w9, eq ; CHECK-NEXT: csinv w9, w9, wzr, vc -; CHECK-NEXT: fcmp s3, s2 -; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: lsl w11, w11, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s5, w11 -; CHECK-NEXT: fmov s6, w8 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: mov v2.h[1], w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov s6, w10 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: fmov w10, s5 ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: fmov s6, w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: lsl 
w8, w9, #16 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: csetm w10, eq -; CHECK-NEXT: csinv w10, w10, wzr, vc -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: csinv w8, w8, wzr, vc +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc @@ -1044,90 +974,58 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ugt: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, hi -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup 
v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, hi -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, hi +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: 
fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, hi -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, hi ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1139,90 +1037,58 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_uge: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, pl -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; 
CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, pl -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, pl +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: 
mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, pl -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, pl ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1234,90 +1100,58 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ult: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, lt -; 
CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, lt -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, lt +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, 
v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, lt -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, lt ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1329,90 +1163,58 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ule: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 
-; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, le -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, le -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, le +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: 
csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, le -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, le ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1424,90 +1226,58 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_uno: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 
-; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, vs -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, vs -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, vs +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; 
CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, vs -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, vs ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1519,96 +1289,64 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: fmov w8, 
s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: lsl w9, w11, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s7, w9 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: csetm w10, mi -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csinv w10, w10, wzr, le -; CHECK-NEXT: fcmp s7, s6 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w11, s4 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h4, v0.h[4] -; CHECK-NEXT: mov h7, v1.h[5] +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: csinv w8, w8, wzr, le +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] ; CHECK-NEXT: csetm w9, mi ; CHECK-NEXT: csinv w9, w9, wzr, le -; CHECK-NEXT: fcmp s3, s2 -; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: lsl w11, w11, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s5, w11 -; CHECK-NEXT: fmov s6, w8 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: mov v2.h[1], w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: mov h3, v1.h[6] -; 
CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov s6, w10 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: fmov w10, s5 ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: fmov s6, w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: lsl w8, w9, #16 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: csetm w10, mi -; CHECK-NEXT: csinv w10, w10, wzr, le -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: csinv w8, w8, wzr, le +; CHECK-NEXT: 
fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le @@ -1622,90 +1360,58 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_oeq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, eq -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: 
shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, eq +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, eq -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; 
CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1717,90 +1423,58 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ogt: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, gt -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, gt -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: 
fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, gt +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, gt -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; 
CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, gt ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1812,90 +1486,58 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_oge: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, ge -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, ge -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov 
w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, ge +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, 
#16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, ge -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, ge ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1907,90 +1549,58 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_olt: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, mi -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; 
CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, mi +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; 
CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, mi -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -2002,90 +1612,58 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ole: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, ls -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; 
CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, ls -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, ls +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov 
s6, w9 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, ls -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, ls ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -2097,90 +1675,58 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ord: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, vc -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: 
fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, vc -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, vc +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, 
w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, vc -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, vc ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll index 40684b0f3a256..e3263252875f7 100644 --- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll @@ -76,11 +76,9 @@ entry: define bfloat @t7(bfloat %x) { ; CHECK-LABEL: t7: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w9, s0 ; CHECK-NEXT: scvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 @@ -101,11 +99,9 @@ entry: define bfloat @t8(bfloat %x) { ; CHECK-LABEL: t8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed 
$h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu w9, s0 ; CHECK-NEXT: ucvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 @@ -198,11 +194,9 @@ entry: define bfloat @t7_strict(bfloat %x) #0 { ; CHECK-LABEL: t7_strict: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w9, s0 ; CHECK-NEXT: scvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 @@ -223,11 +217,9 @@ entry: define bfloat @t8_strict(bfloat %x) #0 { ; CHECK-LABEL: t8_strict: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu w9, s0 ; CHECK-NEXT: ucvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 diff --git a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir index b61fa4be04007..08fc47d9480ce 100644 --- a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir +++ b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir @@ -1,5 +1,5 @@ +# RUN: llc -mtriple=aarch64 -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG # RUN: llc -mtriple=aarch64 -verify-machineinstrs -o - -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking %s | FileCheck %s --check-prefix=CHECK -# RUN: llc -mtriple=aarch64 
-verify-machineinstrs -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG # REQUIRES: asserts # CHECK-DBG: ********** REGISTER COALESCER ********** @@ -36,3 +36,94 @@ body: | RET_ReallyLR ... +# CHECK-DBG: ********** REGISTER COALESCER ********** +# CHECK-DBG: ********** Function: reproducer +# CHECK-DBG: ********** JOINING INTERVALS *********** +# CHECK-DBG: ********** INTERVALS ********** +# CHECK-DBG: %1 [32r,48B:2)[48B,320r:0)[320r,368B:1) 0@48B-phi 1@320r 2@32r +# CHECK-DBG-SAME: weight:0.000000e+00 +# CHECK-DBG: %3 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi +# CHECK-DBG-SAME: L0000000000000080 [288r,304B:0)[304B,320r:3) 0@288r 1@x 2@x 3@304B-phi +# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi +# CHECK-DBG-SAME: weight:0.000000e+00 +--- +name: reproducer +tracksRegLiveness: true +body: | + bb.0: + %0:gpr32 = MOVi32imm 1 + %1:gpr64 = IMPLICIT_DEF + + bb.1: + + bb.2: + %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32 + + bb.3: + $nzcv = IMPLICIT_DEF + %4:gpr64 = COPY killed %3 + Bcc 1, %bb.7, implicit killed $nzcv + + bb.4: + $nzcv = IMPLICIT_DEF + Bcc 1, %bb.6, implicit killed $nzcv + + bb.5: + %5:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32 + %4:gpr64 = COPY killed %5 + B %bb.7 + + bb.6: + %4:gpr64 = COPY $xzr + + bb.7: + %7:gpr64 = ADDXrs killed %1, killed %4, 1 + %1:gpr64 = COPY killed %7 + B %bb.1 + +... 
+# CHECK-DBG: ********** REGISTER COALESCER ********** +# CHECK-DBG: ********** Function: reproducer2 +# CHECK-DBG: ********** JOINING INTERVALS *********** +# CHECK-DBG: ********** INTERVALS ********** +# CHECK-DBG: %1 [32r,48B:2)[48B,304r:0)[304r,352B:1) 0@48B-phi 1@304r 2@32r +# CHECK-DBG-SAME: weight:0.000000e+00 +# CHECK-DBG: %3 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi +# CHECK-DBG-SAME: L0000000000000080 [224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@x 3@288B-phi +# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi +# CHECK-DBG-SAME: weight:0.000000e+00 +--- +name: reproducer2 +tracksRegLiveness: true +body: | + bb.0: + %0:gpr32 = MOVi32imm 1 + %1:gpr64 = IMPLICIT_DEF + + bb.1: + + bb.2: + %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32 + + bb.3: + $nzcv = IMPLICIT_DEF + %4:gpr64 = COPY killed %3 + Bcc 1, %bb.7, implicit killed $nzcv + + bb.4: + $nzcv = IMPLICIT_DEF + Bcc 1, %bb.6, implicit killed $nzcv + + bb.5: + %4:gpr64 = IMPLICIT_DEF + B %bb.7 + + bb.6: + %4:gpr64 = COPY $xzr + + bb.7: + %5:gpr64 = ADDXrs killed %1, killed %4, 1 + %1:gpr64 = COPY killed %5 + B %bb.1 + +... 
diff --git a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll index ec7548e1e6541..b7fae2bff6876 100644 --- a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll @@ -7,19 +7,17 @@ define i32 @testmswbf(bfloat %a) { ; CHECK-LABEL: testmswbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintm s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret entry: @@ -31,19 +29,17 @@ entry: define i64 @testmsxbf(bfloat %a) { ; CHECK-LABEL: testmsxbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintm s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs x0, s0 ; CHECK-NEXT: ret entry: @@ -141,19 +137,17 @@ entry: define i32 @testpswbf(bfloat %a) { ; CHECK-LABEL: testpswbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl 
w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintp s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret entry: @@ -165,19 +159,17 @@ entry: define i64 @testpsxbf(bfloat %a) { ; CHECK-LABEL: testpsxbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintp s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs x0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 73f3d4c037ad5..774a22fb907db 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s 
| FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-LABEL: test_minmax_i32: @@ -8,6 +10,16 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c) ret i32 %sminmax @@ -45,6 +57,16 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_commuted_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax) ret i32 %sminmax @@ -56,6 +78,16 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_i32: +; GFX12: 
; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c) ret i32 %smaxmin @@ -67,6 +99,16 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_commuted_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin) ret i32 %smaxmin @@ -79,6 +121,17 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) { ; GFX11-NEXT: v_med3_i32 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_smed3_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_med3_i32 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y) %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y) %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z) @@ -93,6 +146,16 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_u32: +; GFX12: 
; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c) ret i32 %uminmax @@ -130,6 +193,16 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_minmax_commuted_u32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax) ret i32 %uminmax @@ -141,6 +214,16 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_u32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c) ret i32 %umaxmin @@ -152,6 +235,16 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_maxmin_commuted_u32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: 
s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin) ret i32 %umaxmin @@ -164,6 +257,17 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) { ; GFX11-NEXT: v_med3_u32 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_umed3_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_med3_u32 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y) %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y) %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z) @@ -173,44 +277,88 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) { } define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) { -; SDAG-LABEL: test_minmax_f32_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; SDAG-NEXT: v_max_f32_e32 v2, v2, v2 -; SDAG-NEXT: v_maxmin_f32 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_minmax_f32_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_minmax_f32_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_dual_max_f32 
v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GISEL-NEXT: v_maxmin_f32 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-LABEL: test_minmax_f32_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) %minmax = call float @llvm.minnum.f32(float %max, float %c) ret float %minmax } define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-LABEL: s_test_minmax_f32_ieee_false: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; SDAG-NEXT: s_mov_b32 s5, s4 -; SDAG-NEXT: s_mov_b32 s4, s3 -; SDAG-NEXT: v_maxmin_f32 v0, 
s0, s1, v0 -; SDAG-NEXT: global_store_b32 v1, v0, s[4:5] -; SDAG-NEXT: s_endpgm +; SDAG-GFX11-LABEL: s_test_minmax_f32_ieee_false: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX11-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-NEXT: v_maxmin_f32 v0, s0, s1, v0 +; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; SDAG-GFX11-NEXT: s_endpgm ; -; GISEL-LABEL: s_test_minmax_f32_ieee_false: -; GISEL: ; %bb.0: -; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; GISEL-NEXT: s_mov_b32 s6, s3 -; GISEL-NEXT: s_mov_b32 s7, s4 -; GISEL-NEXT: v_maxmin_f32 v0, s0, s1, v0 -; GISEL-NEXT: global_store_b32 v1, v0, s[6:7] -; GISEL-NEXT: s_endpgm +; GISEL-GFX11-LABEL: s_test_minmax_f32_ieee_false: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX11-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-NEXT: v_maxmin_f32 v0, s0, s1, v0 +; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX11-NEXT: s_endpgm +; +; SDAG-GFX12-LABEL: s_test_minmax_f32_ieee_false: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX12-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX12-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-NEXT: v_maxmin_num_f32 v0, s0, s1, v0 +; SDAG-GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; SDAG-GFX12-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: s_test_minmax_f32_ieee_false: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_max_num_f32 s0, s0, s1 +; GISEL-GFX12-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX12-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_min_num_f32 s0, s0, s2 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX12-NEXT: s_endpgm %smax = call float @llvm.maxnum.f32(float %a, float %b) %sminmax = call float @llvm.minnum.f32(float %smax, float %c) store float %sminmax, ptr 
addrspace(1) %out @@ -222,27 +370,56 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b, ; GFX11: ; %bb.0: ; GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_minmax_commuted_f32_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) %minmax = call float @llvm.minnum.f32(float %c, float %max) ret float %minmax } define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) { -; SDAG-LABEL: test_maxmin_f32_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; SDAG-NEXT: v_max_f32_e32 v2, v2, v2 -; SDAG-NEXT: v_minmax_f32 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_maxmin_f32_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; SDAG-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_maxmin_f32_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GISEL-NEXT: v_minmax_f32 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-LABEL: test_maxmin_f32_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) %maxmin = call float @llvm.maxnum.f32(float %min, float %c) ret float %maxmin @@ -253,6 +430,11 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b, ; GFX11: ; %bb.0: ; GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_maxmin_commuted_f32_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) %maxmin = call float @llvm.maxnum.f32(float %c, float %min) ret float %maxmin @@ -265,6 +447,17 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) ; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_med3_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 +; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call float @llvm.minnum.f32(float %x, float %y) %tmp1 = call float @llvm.maxnum.f32(float %x, float %y) %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z) @@ -278,29 +471,54 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_minmax_f16_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) %minmax = call half @llvm.minnum.f16(half %max, half %c) ret half %minmax } define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-LABEL: s_test_minmax_f16_ieee_false: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; SDAG-NEXT: s_mov_b32 s5, s4 -; SDAG-NEXT: s_mov_b32 s4, s3 -; SDAG-NEXT: v_maxmin_f16 v0, s0, s1, v0 -; SDAG-NEXT: global_store_b16 v1, v0, s[4:5] -; SDAG-NEXT: s_endpgm +; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX11-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0 +; SDAG-GFX11-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX11-NEXT: s_endpgm ; -; GISEL-LABEL: s_test_minmax_f16_ieee_false: -; GISEL: ; %bb.0: -; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 -; GISEL-NEXT: s_mov_b32 s6, s3 -; GISEL-NEXT: s_mov_b32 s7, s4 -; GISEL-NEXT: v_maxmin_f16 v0, s0, s1, v0 -; GISEL-NEXT: global_store_b16 v1, v0, s[6:7] -; GISEL-NEXT: s_endpgm +; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, 0 +; GISEL-GFX11-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0 +; GISEL-GFX11-NEXT: global_store_b16 v1, v0, s[6:7] +; GISEL-GFX11-NEXT: s_endpgm +; +; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; SDAG-GFX12-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX12-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, s0, s1, v0 +; SDAG-GFX12-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-GFX12-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_max_num_f16 s0, s0, s1 +; GISEL-GFX12-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX12-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_min_num_f16 s0, s0, s2 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX12-NEXT: global_store_b16 v1, v0, s[6:7] +; GISEL-GFX12-NEXT: s_endpgm %smax = call half @llvm.maxnum.f16(half %a, half %b) %sminmax = call half @llvm.minnum.f16(half %smax, half %c) store half %sminmax, ptr addrspace(1) %out @@ -308,23 +526,49 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b } define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) { -; SDAG-LABEL: test_minmax_commuted_f16_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; SDAG-NEXT: v_maxmin_f16 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_minmax_commuted_f16_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GISEL-NEXT: v_max_f16_e32 v2, v2, v2 -; GISEL-NEXT: v_maxmin_f16 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: 
test_minmax_commuted_f16_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) %minmax = call half @llvm.minnum.f16(half %c, half %max) ret half %minmax @@ -335,29 +579,60 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half 
%c) { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: test_maxmin_f16_ieee_false: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; GFX12-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) %maxmin = call half @llvm.maxnum.f16(half %min, half %c) ret half %maxmin } define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) { -; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; SDAG-NEXT: v_minmax_f16 v0, v0, v1, v2 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GISEL-NEXT: v_max_f16_e32 v2, v2, v2 -; GISEL-NEXT: v_minmax_f16 v0, v0, v1, v2 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; SDAG-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2 +; GISEL-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; SDAG-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GISEL-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) %maxmin = call half @llvm.maxnum.f16(half %c, half %min) ret half %maxmin @@ -370,6 +645,17 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 ; GFX11-NEXT: v_med3_f16 v2, v2, v3, v4 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_med3_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_med3_num_f16 v2, v2, v3, v4 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp0 = call half @llvm.minnum.f16(half %x, half %y) %tmp1 = call half @llvm.maxnum.f16(half %x, half %y) %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z) diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll new file mode 100644 index 0000000000000..a9b8663a48dea --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908 + +define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { +; GFX942-LABEL: matmul_kernel: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: s_branch .LBB0_2 +; GFX942-NEXT: .LBB0_1: ; %bb2 +; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX942-NEXT: s_or_b32 s4, s3, 1 +; GFX942-NEXT: s_ashr_i32 s5, s3, 31 +; GFX942-NEXT: s_mov_b32 s3, s2 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: s_and_b32 s3, s5, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3] +; GFX942-NEXT: s_cbranch_execz .LBB0_4 +; GFX942-NEXT: .LBB0_2: ; %bb +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_cbranch_vccz .LBB0_1 +; GFX942-NEXT: ; %bb.3: +; GFX942-NEXT: ; implicit-def: $sgpr3 +; GFX942-NEXT: .LBB0_4: ; %common.ret +; GFX942-NEXT: s_endpgm +; +; GFX908-LABEL: matmul_kernel: +; GFX908: ; %bb.0: ; %entry +; GFX908-NEXT: 
s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: s_mov_b32 s2, 0 +; GFX908-NEXT: s_mov_b32 s3, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_cmp_lg_u32 s0, 0 +; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX908-NEXT: s_branch .LBB0_2 +; GFX908-NEXT: .LBB0_1: ; %bb2 +; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX908-NEXT: s_or_b32 s4, s3, 1 +; GFX908-NEXT: s_ashr_i32 s5, s3, 31 +; GFX908-NEXT: s_mov_b32 s3, s2 +; GFX908-NEXT: s_nop 3 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_mov_b32_e32 v5, s3 +; GFX908-NEXT: v_mov_b32_e32 v4, s2 +; GFX908-NEXT: v_mov_b32_e32 v2, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: s_and_b32 s3, s5, s4 +; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3] +; GFX908-NEXT: s_cbranch_execz .LBB0_4 +; GFX908-NEXT: .LBB0_2: ; %bb +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_cbranch_vccz .LBB0_1 +; GFX908-NEXT: ; %bb.3: +; GFX908-NEXT: ; implicit-def: $sgpr3 +; GFX908-NEXT: .LBB0_4: ; %common.ret +; GFX908-NEXT: s_endpgm +entry: + br label %bb + +bb: + %i = phi { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } [ %i10, %bb2 ], [ zeroinitializer, %entry ] + %i1 = phi i32 [ %i5, %bb2 ], [ 0, %entry ] + %c0 = icmp ne i32 %a0, 0 + br i1 %c0, label %bb2, label %bb11 + +bb2: + %i3 = or i32 %i1, 1 + %i4 = icmp slt i32 %i1, 0 + %i5 = select i1 %i4, i32 %i3, i32 0 + %i6 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 123 + %i7 = insertelement <4 x float> zeroinitializer, float %i6, i32 0 + %i8 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0) + %i9 = extractelement <4 x float> %i8, i32 0 + %i10 = insertvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } zeroinitializer, float %i9, 123 + br label %bb + +bb11: + %c1 = icmp ne i32 %a1, 0 + br i1 %c1, label %bb12, label %common.ret + +common.ret: + ret void + +bb12: + %i13 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 0 + %i14 = insertelement <4 x float> zeroinitializer, float %i13, i32 0 + %i15 = insertelement <4 x float> %i14, float 0.000000e+00, i32 0 + %i16 = insertelement <4 x 
float> %i15, float 0.000000e+00, i32 0 + br label %common.ret +} + +; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir new file mode 100644 index 0000000000000..5c83170563e59 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ -0,0 +1,235 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE + +... 
+--- +name: test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: S_BITCMP1_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:agpr_32 = PHI [[COPY1]], %bb.0, %24, %bb.3 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %11, %bb.3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]] + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], 1, implicit-def dead $scc + ; CHECK-NEXT: 
[[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[PHI1]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_ASHR_I32_]], killed [[S_OR_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, [[S_AND_B32_]], %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_]].sub0, %bb.2 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.1, [[S_MOV_B64_1]], %bb.2 + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI4]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_1]], 1, implicit $exec + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: S_ENDPGM 0 + ; + ; COALESCE-LABEL: name: test + ; COALESCE: bb.0: + ; COALESCE-NEXT: successors: 
%bb.1(0x80000000) + ; COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 + ; COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc + ; COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec + ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec + ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec + ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.1: + ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 + ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc + ; COALESCE-NEXT: S_BRANCH %bb.2 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.2: + ; COALESCE-NEXT: successors: %bb.3(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc + ; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc + ; COALESCE-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc + ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]] + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.3: + ; COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec + ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec + ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + ; COALESCE-NEXT: S_BRANCH %bb.4 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.4: + ; COALESCE-NEXT: successors: %bb.5(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.5: + ; COALESCE-NEXT: S_ENDPGM 0 + ; + ; GFX908-COALESCE-LABEL: name: test + ; GFX908-COALESCE: bb.0: + ; GFX908-COALESCE-NEXT: successors: %bb.1(0x80000000) + ; GFX908-COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; 
GFX908-COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 + ; GFX908-COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX908-COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec + ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.1: + ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 + ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc + ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc + ; GFX908-COALESCE-NEXT: S_BRANCH %bb.2 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.2: + ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc + ; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc + ; GFX908-COALESCE-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]] + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.3: + ; GFX908-COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec + ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec + ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc + ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + ; GFX908-COALESCE-NEXT: S_BRANCH %bb.4 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.4: + ; GFX908-COALESCE-NEXT: successors: %bb.5(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.5: + ; GFX908-COALESCE-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1 + liveins: $sgpr4_sgpr5 + + %0:sgpr_64(p4) = COPY $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + S_BITCMP1_B32 killed %1, 0, implicit-def $scc + %2:sgpr_32 = S_MOV_B32 0 + %3:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_32 = IMPLICIT_DEF + %6:vgpr_32 = 
V_CNDMASK_B32_e64 0, 0, 0, 1, %3, implicit $exec + %7:sreg_64_xexec = V_CMP_NE_U32_e64 %6, 1, implicit $exec + + bb.1: + successors: %bb.2, %bb.3 + + %8:vgpr_32 = PHI %4, %bb.0, %9, %bb.3 + %10:sreg_32 = PHI %2, %bb.0, %11, %bb.3 + %12:agpr_32 = COPY %8 + %13:sreg_64 = S_MOV_B64 -1 + $vcc = S_AND_B64 $exec, %7, implicit-def $scc + S_CBRANCH_VCCNZ %bb.3, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + + %14:sreg_32 = S_OR_B32 %10, 1, implicit-def dead $scc + %15:sreg_32 = S_ASHR_I32 %10, 31, implicit-def dead $scc + %16:sreg_32 = S_AND_B32 killed %15, killed %14, implicit-def dead $scc + %17:vreg_128_align2 = REG_SEQUENCE %8, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3 + %18:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1 + %19:vreg_64_align2 = COPY %18 + %20:areg_128_align2 = COPY %17 + %21:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %19, %19, killed %20, 0, 0, 0, implicit $mode, implicit $exec + %22:vgpr_32 = COPY %21.sub0 + %23:sreg_64 = S_MOV_B64 0 + + bb.3: + successors: %bb.4, %bb.1 + + %11:sreg_32 = PHI %5, %bb.1, %16, %bb.2 + %24:agpr_32 = PHI %12, %bb.1, %21.sub0, %bb.2 + %25:sreg_64_xexec = PHI %13, %bb.1, %23, %bb.2 + %9:vgpr_32 = COPY %24 + %26:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %25, implicit $exec + %27:sreg_64_xexec = V_CMP_NE_U32_e64 %26, 1, implicit $exec + $vcc = S_AND_B64 $exec, %27, implicit-def $scc + S_CBRANCH_VCCNZ %bb.1, implicit $vcc + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5 + + bb.5: + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir new file mode 100644 index 0000000000000..49c0aaf9fb390 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir @@ -0,0 +1,182 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE + +... +--- +name: test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: S_BITCMP0_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: 
[[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_4:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_4]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_3:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MFMA_F32_16X16X16F16_e64_3]].sub0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:agpr_32 = PHI [[V_ACCVGPR_WRITE_B32_e64_]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_3]].sub0, %bb.2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]] + ; CHECK-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; 
CHECK-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_PACK_B32_F16_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + ; CHECK-NEXT: S_ENDPGM 0 + ; + ; COALESCE-LABEL: name: test + ; COALESCE: bb.0: + ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 + ; COALESCE-NEXT: S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; COALESCE-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.1: + ; COALESCE-NEXT: successors: %bb.3(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: S_BRANCH %bb.3 + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.2: + ; COALESCE-NEXT: successors: %bb.3(0x80000000) + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: 
[[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: {{ $}} + ; COALESCE-NEXT: bb.3: + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec + ; COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; COALESCE-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + ; COALESCE-NEXT: S_ENDPGM 0 + ; + ; GFX908-COALESCE-LABEL: name: test + ; GFX908-COALESCE: bb.0: + ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), 
%bb.1(0x40000000) + ; GFX908-COALESCE-NEXT: liveins: $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX908-COALESCE-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 + ; GFX908-COALESCE-NEXT: S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc + ; GFX908-COALESCE-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.1: + ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: S_BRANCH %bb.3 + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.2: + ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 + ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: 
[[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: {{ $}} + ; GFX908-COALESCE-NEXT: bb.3: + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; GFX908-COALESCE-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + ; GFX908-COALESCE-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.2, %bb.1 + liveins: $sgpr4_sgpr5 + + %0:sgpr_64(p4) = COPY $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sgpr_32 = S_MOV_B32 0 + S_BITCMP0_B32 killed %1, 0, implicit-def $scc 
+ S_CBRANCH_SCC0 %bb.2, implicit $scc + + bb.1: + successors: %bb.3 + + %3:sgpr_32 = COPY %2 + %4:vgpr_32 = COPY %3, implicit $exec + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3 + + %5:sgpr_32 = S_MOV_B32 0 + %6:vgpr_32 = COPY %5 + %7:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %8:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %9:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %10:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec + %11:areg_128_align2 = REG_SEQUENCE %7, %subreg.sub0, %8, %subreg.sub1, %9, %subreg.sub2, %10, %subreg.sub3 + %12:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %5, %subreg.sub1 + %13:vreg_64_align2 = COPY %12 + %14:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %11, 0, 0, 0, implicit $mode, implicit $exec + %15:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %14, 0, 0, 0, implicit $mode, implicit $exec + %16:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %15, 0, 0, 0, implicit $mode, implicit $exec + %17:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %16, 0, 0, 0, implicit $mode, implicit $exec + %18:vgpr_32 = COPY %17.sub0 + %19:vgpr_32 = COPY %18 + + bb.3: + %20:vgpr_32 = PHI %4, %bb.1, %19, %bb.2 + %21:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, %20, 0, 0, implicit $mode, implicit $exec + %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, %2, 0, 0, implicit $mode, implicit $exec + %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %24:vreg_64_align2 = REG_SEQUENCE %22, %subreg.sub0, killed %23, %subreg.sub1 + %25:sgpr_128 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1, %2, %subreg.sub2, %2, %subreg.sub3 + %26:vreg_64_align2 = COPY %24 + BUFFER_STORE_DWORDX2_OFFSET_exact killed %26, killed %25, %2, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8) + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll index 0f93367fe68c1..786d9ec783ba9 100644 --- a/llvm/test/CodeGen/AMDGPU/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -1,6 +1,8 @@ ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -98,6 +100,8 @@ declare i64 @llvm.smin.i64(i64, i64) ; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}} ; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]] ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17 +; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -686,6 +690,8 @@ bb: ; VI: v_max_i16 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_smed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -707,6 +713,8 @@ bb: ; GCN-LABEL: 
{{^}}v_test_smed3_i16_pat_1: ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_smed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll index 6849c8b4e609e..f0b3d334af67d 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -O0 2> %t.err < %s | FileCheck %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s ; RUN: FileCheck -check-prefix=ERR %s < %t.err ; FIXME: This error will be fixed by supporting arbitrary divergent diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll index 83adaf03c7a2b..7ee1bbb934b03 100644 --- a/llvm/test/CodeGen/AMDGPU/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/umed3.ll @@ -1,6 +1,8 @@ ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s declare i32 
@llvm.amdgcn.workitem.id.x() #0 @@ -84,6 +86,8 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr add ; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}} ; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]] ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17 +; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -707,6 +711,8 @@ bb: ; VI: v_max_u16 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_umed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -728,6 +734,8 @@ bb: ; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1: ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_umed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/DirectX/BufferLoad.ll b/llvm/test/CodeGen/DirectX/BufferLoad.ll index 7f1291bf4a5c8..86e2217a8e76f 100644 --- a/llvm/test/CodeGen/DirectX/BufferLoad.ll +++ b/llvm/test/CodeGen/DirectX/BufferLoad.ll @@ -17,8 +17,9 @@ define void @loadv4f32() { ; CHECK-NOT: %dx.resource.casthandle ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) - %data0 = call <4 x 
float> @llvm.dx.resource.load.typedbuffer( + %load0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0) + %data0 = extractvalue {<4 x float>, i1} %load0, 0 ; The extract order depends on the users, so don't enforce that here. ; CHECK-DAG: [[VAL0_0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0 @@ -34,8 +35,9 @@ define void @loadv4f32() { call void @scalar_user(float %data0_2) ; CHECK: [[DATA4:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 4, i32 undef) - %data4 = call <4 x float> @llvm.dx.resource.load.typedbuffer( + %load4 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 4) + %data4 = extractvalue {<4 x float>, i1} %load4, 0 ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 0 ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 1 @@ -48,8 +50,9 @@ define void @loadv4f32() { call void @vector_user(<4 x float> %data4) ; CHECK: [[DATA12:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 12, i32 undef) - %data12 = call <4 x float> @llvm.dx.resource.load.typedbuffer( + %load12 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 12) + %data12 = extractvalue {<4 x float>, i1} %load12, 0 ; CHECK: [[DATA12_3:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA12]], 3 %data12_3 = extractelement <4 x float> %data12, i32 3 @@ -70,8 +73,9 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) { i32 0, i32 0, i32 1, i32 0, i1 false) ; CHECK: [[LOAD:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 %bufindex, i32 undef) - %load = call <4 x float> @llvm.dx.resource.load.typedbuffer( + %load = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 
%bufindex) + %data = extractvalue {<4 x float>, i1} %load, 0 ; CHECK: [[ALLOCA:%.*]] = alloca [4 x float] ; CHECK: [[V0:%.*]] = extractvalue %dx.types.ResRet.f32 [[LOAD]], 0 @@ -89,10 +93,10 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) { ; ; CHECK: [[PTR:%.*]] = getelementptr inbounds [4 x float], ptr [[ALLOCA]], i32 0, i32 %elemindex ; CHECK: [[X:%.*]] = load float, ptr [[PTR]] - %data = extractelement <4 x float> %load, i32 %elemindex + %x = extractelement <4 x float> %data, i32 %elemindex ; CHECK: call void @scalar_user(float [[X]]) - call void @scalar_user(float %data) + call void @scalar_user(float %x) ret void } @@ -105,8 +109,9 @@ define void @loadf32() { i32 0, i32 0, i32 1, i32 0, i1 false) ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) - %data0 = call float @llvm.dx.resource.load.typedbuffer( + %load0 = call {float, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 0) + %data0 = extractvalue {float, i1} %load0, 0 ; CHECK: [[VAL0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0 ; CHECK: call void @scalar_user(float [[VAL0]]) @@ -123,7 +128,7 @@ define void @loadv2f32() { i32 0, i32 0, i32 1, i32 0, i1 false) ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) - %data0 = call <2 x float> @llvm.dx.resource.load.typedbuffer( + %data0 = call {<2 x float>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <2 x float>, 0, 0, 0) %buffer, i32 0) ret void @@ -137,7 +142,7 @@ define void @loadv4f32_checkbit() { i32 0, i32 0, i32 1, i32 0, i1 false) ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) - %data0 = call {<4 x float>, i1} @llvm.dx.resource.loadchecked.typedbuffer.f32( + %data0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.f32( 
target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0) ; CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 4 @@ -158,7 +163,7 @@ define void @loadv4i32() { i32 0, i32 0, i32 1, i32 0, i1 false) ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) - %data0 = call <4 x i32> @llvm.dx.resource.load.typedbuffer( + %data0 = call {<4 x i32>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 0) ret void @@ -172,7 +177,7 @@ define void @loadv4f16() { i32 0, i32 0, i32 1, i32 0, i1 false) ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) - %data0 = call <4 x half> @llvm.dx.resource.load.typedbuffer( + %data0 = call {<4 x half>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 0) ret void @@ -186,7 +191,7 @@ define void @loadv4i16() { i32 0, i32 0, i32 1, i32 0, i1 false) ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i16 @dx.op.bufferLoad.i16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) - %data0 = call <4 x i16> @llvm.dx.resource.load.typedbuffer( + %data0 = call {<4 x i16>, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x i16>, 0, 0, 0) %buffer, i32 0) ret void diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll index 9b7e7fd04f605..8769e6ec66d8e 100644 --- a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll @@ -15,17 +15,19 @@ define void @load_float4(i32 %index, i32 %elemindex) { %ptr = call ptr @llvm.dx.resource.getpointer( target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[VALUE:.*]] = call <4 x float> 
@llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VALUE:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) %vec_data = load <4 x float>, ptr %ptr call void @use_float4(<4 x float> %vec_data) - ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0 ; CHECK: extractelement <4 x float> %[[VALUE]], i32 1 %y_ptr = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 1 %y_data = load float, ptr %y_ptr call void @use_float(float %y_data) - ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0 ; CHECK: extractelement <4 x float> %[[VALUE]], i32 %elemindex %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex %dyndata = load float, ptr %dynamic diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll index 17606408cadff..0b7882ac722e8 100644 --- a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll @@ -18,21 
+18,24 @@ define void @store_float4(<4 x float> %data, i32 %index, i32 %elemindex) { ; Store just the .x component %scalar = extractelement <4 x float> %data, i32 0 - ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 0 + ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0 + ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 0 ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]]) store float %scalar, ptr %ptr ; Store just the .y component - ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 1 + ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0 + ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 1 ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]]) %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 4 store float %scalar, ptr %y_ptr ; Store to one of the elements dynamically - ; CHECK: %[[LOAD:.*]] = call <4 x float> 
@llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 %elemindex + ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0 + ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 %elemindex ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]]) %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex store float %scalar, ptr %dynamic @@ -56,14 +59,16 @@ define void @store_half4(<4 x half> %data, i32 %index) { ; Store just the .x component %scalar = extractelement <4 x half> %data, i32 0 - ; CHECK: %[[LOAD:.*]] = call <4 x half> @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 0 + ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0 + ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 0 ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]]) store half %scalar, ptr %ptr ; Store just the .y component - ; CHECK: %[[LOAD:.*]] = call <4 x half> 
@llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 1 + ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0 + ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 1 ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]]) %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 2 store half %scalar, ptr %y_ptr @@ -87,14 +92,16 @@ define void @store_double2(<2 x double> %data, i32 %index) { ; Store just the .x component %scalar = extractelement <2 x double> %data, i32 0 - ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 0 + ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0 + ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 0 ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]]) store double %scalar, ptr %ptr ; Store just the .y component - ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x 
double>, 1, 0, 0) %buffer, i32 %index) - ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 1 + ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index) + ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0 + ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 1 ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]]) %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 8 store double %scalar, ptr %y_ptr diff --git a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll index c837b36a19e11..cd21adc11a9b4 100644 --- a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll +++ b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll @@ -29,18 +29,20 @@ entry: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() ; CHECK-NOT: load {{.*}} ptr @In %1 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4 - ; CSE: call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t - %2 = call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0) + ; CSE: call noundef { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t + %load = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0) + %2 = extractvalue {<4 x float>, i1} %load, 0 ; CHECK-NOT: load {{.*}} ptr @In %3 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4 - %4 = call noundef <4 x float> 
@llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0) + %load2 = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0) + %4 = extractvalue {<4 x float>, i1} %load2, 0 %add.i = fadd <4 x float> %2, %4 call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, i32 %0, <4 x float> %add.i) ; CHECK: ret void ret void } -; CSE-DAG: declare <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]] +; CSE-DAG: declare { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]] ; CSE-DAG: declare void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]] attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="8,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll index 26223359dfdf1..060d54f961c70 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll @@ -17,8 +17,9 @@ target triple = "dxil-pc-shadermodel6.7-library" define <4 x float> @multicomponent() #0 { %res = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) - %val = call <4 x float> @llvm.dx.resource.load.typedbuffer( + %load = call {<4 x float>, i1} 
@llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %res, i32 0) + %val = extractvalue {<4 x float>, i1} %load, 0 ret <4 x float> %val } @@ -26,8 +27,9 @@ define <4 x float> @multicomponent() #0 { define float @onecomponent() #0 { %res = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) - %val = call float @llvm.dx.resource.load.typedbuffer( + %load = call {float, i1} @llvm.dx.resource.load.typedbuffer( target("dx.TypedBuffer", float, 1, 0, 0) %res, i32 0) + %val = extractvalue {float, i1} %load, 0 ret float %val } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll index 385156b3b99d4..4878699226582 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll @@ -1,9 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 - ; RUN: llc -mtriple=riscv32 -mattr=+zfa,d -global-isel < %s \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32IDZFA ; RUN: llc -mtriple=riscv64 -mattr=+zfa,d -global-isel < %s \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64DZFA define double @fceil(double %a) { @@ -86,3 +85,32 @@ define double @fminimum(double %a, double %b) { %c = call double @llvm.minimum.f64(double %a, double %b) ret double %c } + +define i64 @fmvh_x_d(double %fa) { +; RV32IDZFA-LABEL: fmvh_x_d: +; RV32IDZFA: # %bb.0: +; RV32IDZFA-NEXT: fmv.x.w a0, fa0 +; RV32IDZFA-NEXT: fmvh.x.d a1, fa0 +; RV32IDZFA-NEXT: ret +; +; RV64DZFA-LABEL: fmvh_x_d: +; RV64DZFA: # %bb.0: +; RV64DZFA-NEXT: fmv.x.d a0, fa0 +; RV64DZFA-NEXT: ret + %i = bitcast double %fa to i64 + ret i64 %i +} + +define double @fmvp_d_x(i64 %a) { +; RV32IDZFA-LABEL: fmvp_d_x: +; RV32IDZFA: # %bb.0: +; RV32IDZFA-NEXT: fmvp.d.x fa0, a0, a1 +; RV32IDZFA-NEXT: ret +; +; RV64DZFA-LABEL: fmvp_d_x: +; 
RV64DZFA: # %bb.0: +; RV64DZFA-NEXT: fmv.d.x fa0, a0 +; RV64DZFA-NEXT: ret + %or = bitcast i64 %a to double + ret double %or +} diff --git a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll index fe89b4aa24171..d7f62ae834346 100644 --- a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll +++ b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll @@ -320,6 +320,19 @@ define i64 @add_shl_moreOneUse_sh3add(i64 %x) { ret i64 %add } +;; Covers a case which previously crashed (pr119527) +define i64 @add_shl_sext(i32 %1) { +; RV64-LABEL: add_shl_sext: +; RV64: # %bb.0: +; RV64-NEXT: addi a1, a0, 3 +; RV64-NEXT: sllw a0, a1, a0 +; RV64-NEXT: ret + %3 = add i32 %1, 3 + %4 = shl i32 %3, %1 + %5 = sext i32 %4 to i64 + ret i64 %5 +} + define i64 @add_shl_moreOneUse_sh4add(i64 %x) { ; RV64-LABEL: add_shl_moreOneUse_sh4add: ; RV64: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 4f42d5c655280..15e287d66754b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4129,6 +4129,62 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ret <32 x i8> %shuffle } +; PR121823 +define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) { +; AVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,9,0,3] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],zero,zero,zero,zero +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,2,4,5,6,14,15] +; AVX512VLBW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,9,0,3,11,2,5,13,4,7,15,6,17,25,16,19,27,18,21,29,20,23,31,22,56,57,58,59,60,61,62,63] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],xmm1[1,9,0,3] +; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: 
shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] +; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0] +; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; XOPAVX2-NEXT: retq + %r = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> + ret <32 x i8> %r +} + define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; AVX1: # %bb.0: diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s index fc8d2bdc0540a..6bc92bc29ea8a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s @@ -3746,50 +3746,62 @@ v_max_u16 v5.l, v255.l, v255.h v_max_u16 v255.h, 0xfe0b, vcc_hi // GFX11: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_maxmin_f16 v5, v1, v2, s3 -// GFX11: v_maxmin_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00] +v_maxmin_f16 v5.l, v1.l, v2.l, s3 +// GFX11: v_maxmin_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00] -v_maxmin_f16 v5, v255, s2, s105 -// GFX11: v_maxmin_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01] +v_maxmin_f16 v5.l, v255.l, s2, s105 +// GFX11: v_maxmin_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01] -v_maxmin_f16 v5, s1, v255, exec_hi -// GFX11: v_maxmin_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01] 
+v_maxmin_f16 v5.l, s1, v255.l, exec_hi +// GFX11: v_maxmin_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01] -v_maxmin_f16 v5, s105, s105, exec_lo -// GFX11: v_maxmin_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01] +v_maxmin_f16 v5.l, s105, s105, exec_lo +// GFX11: v_maxmin_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01] -v_maxmin_f16 v5, vcc_lo, ttmp15, v3 -// GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04] +v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX11: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04] -v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1] +v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX11: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1] -v_maxmin_f16 v5, m0, 0.5, m0 -// GFX11: v_maxmin_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01] +v_maxmin_f16 v5.l, m0, 0.5, m0 +// GFX11: v_maxmin_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01] -v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi -// GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01] +v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX11: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01] -v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| -// GFX11: v_maxmin_f16 v5, 
-|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1] +v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX11: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1] -v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| -// GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| +// GFX11: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| -// GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3] +v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX11: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3] -v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2 -// GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b] +v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2 +// GFX11: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b] -v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 -// GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33] +v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 +// GFX11: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33] -v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 -// GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX11: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +v_maxmin_f16 v5.l, v255.h, s2, s105 +// GFX11: v_maxmin_f16 v5.l, v255.h, s2, s105 
op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_f16 v5.l, s1, v255.h, exec_hi +// GFX11: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX11: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] v_maxmin_f32 v5, v1, v2, s3 // GFX11: v_maxmin_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00] @@ -4823,50 +4835,62 @@ v_min_u16 v5.l, v255.l, v255.h v_min_u16 v255.h, 0xfe0b, vcc_hi // GFX11: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_minmax_f16 v5, v1, v2, s3 -// GFX11: v_minmax_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00] +v_minmax_f16 v5.l, v1.l, v2.l, s3 +// GFX11: v_minmax_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_f16 v5.l, v255.l, s2, s105 +// GFX11: v_minmax_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_f16 v5.l, s1, v255.l, exec_hi +// GFX11: v_minmax_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_f16 v5.l, s105, s105, exec_lo +// GFX11: v_minmax_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX11: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04] -v_minmax_f16 v5, v255, s2, s105 -// GFX11: v_minmax_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01] +v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l 
+// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_minmax_f16 v5, s1, v255, exec_hi -// GFX11: v_minmax_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01] +v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX11: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1] -v_minmax_f16 v5, s105, s105, exec_lo -// GFX11: v_minmax_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01] +v_minmax_f16 v5.l, m0, 0.5, m0 +// GFX11: v_minmax_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01] -v_minmax_f16 v5, vcc_lo, ttmp15, v3 -// GFX11: v_minmax_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04] +v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX11: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01] -v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX11: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1] -v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1] +v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| +// GFX11: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_minmax_f16 v5, m0, 0.5, m0 -// GFX11: v_minmax_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01] +v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX11: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3] -v_minmax_f16 v5, |exec_lo|, -1, vcc_hi -// GFX11: v_minmax_f16 v5, 
|exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01] +v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2 +// GFX11: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b] -v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| -// GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1] +v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 +// GFX11: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33] -v_minmax_f16 v5, null, exec_lo, -|0xfe0b| -// GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX11: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] -v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| -// GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3] +v_minmax_f16 v5.l, v255.h, s2, s105 +// GFX11: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01] -v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2 -// GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b] +v_minmax_f16 v5.l, s1, v255.h, exec_hi +// GFX11: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01] -v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 -// GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33] +v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 -// GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; 
encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX11: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] v_minmax_f32 v5, v1, v2, s3 // GFX11: v_minmax_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s index f71569433d326..5fa1334aa6e95 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s @@ -2660,47 +2660,92 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] -v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] -v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] -v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] -v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] -v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, 
vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] -v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 -// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] -v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] -v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] -v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] +v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] -v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 
row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] + +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff] + +v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff] + +v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x01,0x60,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff] + +v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] + +v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] + +v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] 
clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -3704,47 +3749,92 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + 
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] -v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] -v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x61,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] -v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] -v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff] -v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff] -v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff] -v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 -// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] -v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] -v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] -v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// 
GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] -v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] +v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] -v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s index 2ececc0c78ecd..2fc02061c59de 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s @@ -1660,41 +1660,80 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] -v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0] 
+// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -2434,41 +2473,80 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, v1.l, 
v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] + +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| 
dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] 
-v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] -v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] 
fi:0 -// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index c2db5b90bb478..3e7b7d28c2e97 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -3590,50 +3590,62 @@ v_max_u16 v255.l, 0xfe0b, vcc_hi v_max_u16 v255.h, 0xfe0b, vcc_hi // GFX12: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_maxmin_num_f16 v5, v1, v2, s3 -// GFX12: v_maxmin_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] +v_maxmin_num_f16 v5.l, v1.l, v2.l, s3 +// GFX12: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] -v_maxmin_num_f16 v5, v255, s2, s105 -// GFX12: v_maxmin_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] +v_maxmin_num_f16 v5.l, v255.l, s2, s105 +// GFX12: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] -v_maxmin_num_f16 v5, s1, v255, exec_hi -// GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] +v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi +// GFX12: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] -v_maxmin_num_f16 v5, 
s105, s105, exec_lo -// GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] +v_maxmin_num_f16 v5.l, s105, s105, exec_lo +// GFX12: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] -v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 -// GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] +v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX12: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] -v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] +v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX12: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] -v_maxmin_num_f16 v5, m0, 0.5, m0 -// GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] +v_maxmin_num_f16 v5.l, m0, 0.5, m0 +// GFX12: v_maxmin_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] -v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi -// GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] +v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX12: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] -v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| -// GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: 
[0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] +v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX12: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] -v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| -// GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| +// GFX12: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| -// GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] +v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX12: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] -v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 -// GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] +v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 +// GFX12: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] -v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 -// GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] +v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 +// GFX12: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] -v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 -// GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX12: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v5.l, v255.h, s2, 
s105 +// GFX12: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01] + +v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi +// GFX12: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01] + +v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX12: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] v_maxmin_num_f32 v5, v1, v2, s3 // GFX12: v_maxmin_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00] @@ -4580,50 +4592,62 @@ v_min_u16 v255.l, 0xfe0b, vcc_hi v_min_u16 v255.h, 0xfe0b, vcc_hi // GFX12: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_minmax_num_f16 v5, v1, v2, s3 -// GFX12: v_minmax_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] +v_minmax_num_f16 v5.l, v1.l, v2.l, s3 +// GFX12: v_minmax_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] + +v_minmax_num_f16 v5.l, v255.l, s2, s105 +// GFX12: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] + +v_minmax_num_f16 v5.l, s1, v255.l, exec_hi +// GFX12: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] + +v_minmax_num_f16 v5.l, s105, s105, exec_lo +// GFX12: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] + +v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l +// GFX12: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] -v_minmax_num_f16 v5, v255, s2, s105 
-// GFX12: v_minmax_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] +v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l +// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_minmax_num_f16 v5, s1, v255, exec_hi -// GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] +v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| +// GFX12: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] -v_minmax_num_f16 v5, s105, s105, exec_lo -// GFX12: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] +v_minmax_num_f16 v5.l, m0, 0.5, m0 +// GFX12: v_minmax_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] -v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 -// GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] +v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi +// GFX12: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] -v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 -// GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| +// GFX12: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] -v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| -// GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] +v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| +// GFX12: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] -v_minmax_num_f16 v5, m0, 0.5, m0 -// GFX12: v_minmax_num_f16 v5, m0, 0.5, m0 ; encoding: 
[0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] +v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| +// GFX12: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] -v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi -// GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] +v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 +// GFX12: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] -v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| -// GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] +v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 +// GFX12: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] -v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| -// GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX12: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] -v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| -// GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] +v_minmax_num_f16 v5.l, v255.h, s2, s105 +// GFX12: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01] -v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 -// GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] +v_minmax_num_f16 v5.l, s1, v255.h, exec_hi +// GFX12: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01] -v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 -// GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; 
encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] +v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h +// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] -v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 -// GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2 +// GFX12: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] v_minmax_num_f32 v5, v1, v2, s3 // GFX12: v_minmax_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s index ee4561fad367c..ffcf65187747b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s @@ -24,11 +24,11 @@ v_minmax_f32_e64_dpp v0, -v1, -v2, -v3 dpp8:[0,1,2,3,4,5,6,7] v_maxmin_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7] // GFX12: v_maxmin_num_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x80,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0xc6,0xfa] -v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] v_mad_i64_i32 v[5:6], s12, v1, v2, v[3:4] // GFX12: v_mad_co_i64_i32 v[5:6], s12, v1, v2, v[3:4] ; encoding: [0x05,0x0c,0xff,0xd6,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 623e66885aaec..aa804cc302bf0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -2921,53 +2921,98 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 -// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] -v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] -v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: 
v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] -v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 +// GFX12: v_maxmin_num_f16_e64_dpp 
v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] + +v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 
+ +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] + +v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -3956,53 +4001,98 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] + +v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] -v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] -v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff] -v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff] -v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 +// GFX12: v_minmax_num_f16_e64_dpp 
v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff] -v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff] -v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff] -v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff] -v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x01,0x6a,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff] -v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 -// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] -v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] -v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 -// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] -v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h 
op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] -v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] -v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13] +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13] -v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index 056ea80d8a99d..e93a65ec92e73 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -1878,47 +1878,86 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l 
dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp 
v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] -v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] -// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 
v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 
op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -2673,47 +2712,86 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: 
v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] + +v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] -v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, v1.l, 
v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x02,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] -v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] -// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt index f9e236977c973..adcca58776100 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt @@ -4361,49 +4361,118 @@ # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00 -# GFX11: v_maxmin_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: 
v_maxmin_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_maxmin_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01 -# GFX11: v_maxmin_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01 -# GFX11: v_maxmin_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01 -# GFX11: v_maxmin_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01] 
0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX11: v_maxmin_f16 v5, m0, 0.5, m0 ; encoding: 
[0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01 -# GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX11: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_f16 v5, null, exec_lo, 
-|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b -# GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b] 0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33 -# GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33] +# W32-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33] +# W32-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33] +# W64-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: 
[0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33] +# W64-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33] 0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 -# GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01 +# W32-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01] + +0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01 +# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: 
[0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01] + +0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00 # GFX11: v_maxmin_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00] @@ -5851,49 +5920,118 @@ # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00 -# GFX11: v_minmax_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: v_minmax_f16 v5, v1, v2, s3 ; encoding: 
[0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_minmax_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01 -# GFX11: v_minmax_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01 -# GFX11: v_minmax_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01 -# GFX11: v_minmax_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01] 0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX11: v_minmax_f16 v5, vcc_lo, 
ttmp15, v3 ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX11: v_minmax_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_minmax_f16 v5.l, m0, 
0.5, m0 ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_minmax_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: v_minmax_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_minmax_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01 -# GFX11: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# 
W64-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b -# GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b] 0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33 -# GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33] +# W32-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33] +# W32-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33] +# W64-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33] +# W64-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; 
encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33] 0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 -# GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01 +# W32-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01] + +0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01 +# W32-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01] + +0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_minmax_f16 
v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00 # GFX11: v_minmax_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt index 132fc80dda47d..2964360a77fd2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt @@ -2113,46 +2113,118 @@ # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX11: 
v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] 0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] 0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01 -# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] 0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] 0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 -# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13 +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] + +0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -2833,46 +2905,118 @@ # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] 
0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] 0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] 0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01 -# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] 0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13 -# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# 
W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] 0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 -# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp 
v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + +0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13 +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] + +0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt index 714fac9fe62a0..7a81ba23afa35 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt @@ -1141,40 +1141,106 @@ # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05 -# GFX11: 
v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] 0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05 -# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] 0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 -# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: 
v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: 
v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -1585,40 +1651,106 @@ # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# 
W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, 
v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# 
GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] 0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05 -# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, 
-|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] 0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 -# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05 +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05 +# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index 6d48440633f4f..633d3a48634fa 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -4265,49 +4265,120 @@ # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00 -# GFX12: 
v_maxmin_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01 -# GFX12: v_maxmin_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01 -# GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01 -# GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: 
[0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01] 0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: 
v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01 -# GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: 
[0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b -# GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b] 
0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33 -# GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] +# W32-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] +# W32-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] +# W64-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] +# W64-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33] 0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 -# GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01 +# W32-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01] + + 
+0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01 +# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01] + + +0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00 # 
GFX12: v_maxmin_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00] @@ -5693,49 +5764,120 @@ # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00 -# GFX12: v_minmax_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] +# W32-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] +# W32-FAKE16: v_minmax_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] +# W64-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] +# W64-FAKE16: v_minmax_num_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00] 0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01 -# GFX12: v_minmax_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] +# W32-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01] 0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01 -# GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] +# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01] 0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01 -# GFX12: 
v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] +# W32-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] +# W32-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] +# W64-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] +# W64-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01] 0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04 -# GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] +# W32-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] +# W64-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04] 0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 -# GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] 0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1 -# GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: 
[0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] +# W32-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] +# W64-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1] 0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01 -# GFX12: v_minmax_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] +# W32-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] +# W64-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01] 0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01 -# GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] +# W32-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] +# W32-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] +# W64-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] +# W64-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01] 0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1 -# GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] +# W32-FAKE16: v_minmax_num_f16 v5, 
-|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] +# W64-FAKE16: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1] 0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00 -# GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00] 0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3 -# GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] +# W32-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] +# W64-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3] 0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b -# GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] +# W32-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; 
encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] +# W64-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b] 0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33 -# GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] +# W32-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] +# W32-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] +# W64-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] +# W64-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33] 0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 -# GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] + +0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01 +# W32-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01] +# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: 
[0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01] +# W64-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01] +# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01] + + +0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01 +# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01] +# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01] +# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01] +# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01] + + +0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00] + +0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00 +# W32-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp 
div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00 # GFX12: v_minmax_num_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 561d3a6ca7f90..7e30a4a2096b1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -2329,52 +2329,131 @@ # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# 
W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf 
; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# 
W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01 -# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] 0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] 0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 -# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + + +0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + 
+0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] + +0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -3082,49 +3161,125 @@ # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# 
W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff] 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff] 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff] 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff] 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff -# GFX12: 
v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff] 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff] 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-REAL16: 
v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff] 0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff] 0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff] 0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff] 0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff -# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, 
-|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff] 0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01 -# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01] 0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13] 0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 -# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] + +0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] + + +0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff] + +0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01 +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] +# W64-FAKE16: 
v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01] + +0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13 +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13] + +0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index 06b4bfcc8985f..2aaba2a17fae6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -1294,43 +1294,112 @@ # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: 
v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: 
v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp 
v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] 0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05 -# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] 0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 -# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, 
-|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_maxmin_num_f16_e64_dpp 
v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -1768,43 +1837,112 @@ # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05] 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, 
vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05] 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05] 0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W32-FAKE16: 
v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05] 0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05] 0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05] 0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05] 0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05] 
0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05 -# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05] 0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 -# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] + +0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 +# W32-REAL16: 
v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] + +0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05 +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05] + +0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05 +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05] + +0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05 +# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05] + +0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 
0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/RISCV/custom_reloc.s b/llvm/test/MC/RISCV/custom_reloc.s index 4bd470008ee52..cdb819467875f 100644 --- a/llvm/test/MC/RISCV/custom_reloc.s +++ b/llvm/test/MC/RISCV/custom_reloc.s @@ -21,16 +21,33 @@ .reloc ., R_RISCV_VENDOR, VENDOR_NAME .reloc ., R_RISCV_CUSTOM192, my_foo + 1 addi a0, a0, 0 - # CHECK-ASM: [[L1:.L[^:]+]]: + # CHECK-ASM: [[L1:.L[^:]+]]: # CHECK-ASM-NEXT: .reloc [[L1]], R_RISCV_VENDOR, VENDOR_NAME # CHECK-ASM-NEXT: [[L2:.L[^:]+]]: # CHECK-ASM-NEXT: .reloc [[L2]], R_RISCV_CUSTOM192, my_foo+1 # CHECK-ASM-NEXT: mv a0, a0 - # CHECK-OBJ: addi a0, a0, 0 + # CHECK-OBJ: addi a0, a0, 0 # CHECK-OBJ-NEXT: R_RISCV_VENDOR VENDOR_NAME # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_foo+0x1 nop # CHECK-ASM: nop # CHECK-OBJ: addi zero, zero, 0x0 + + .reloc ., R_RISCV_VENDOR, QUALCOMM + .reloc ., R_RISCV_QC_ABS20_U, my_bar + 2 + addi a1, a1, 0 + # CHECK-ASM: [[L3:.L[^:]+]]: + # CHECK-ASM-NEXT: .reloc [[L3]], R_RISCV_VENDOR, QUALCOMM + # CHECK-ASM-NEXT: [[L4:.L[^:]+]]: + # CHECK-ASM-NEXT: .reloc [[L4]], R_RISCV_QC_ABS20_U, my_bar+2 + # CHECK-ASM-NEXT: mv a1, a1 + + # CHECK-OBJ: addi a1, a1, 0 + # CHECK-OBJ-NEXT: R_RISCV_VENDOR QUALCOMM + # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_bar+0x2 + + nop + # CHECK-ASM: nop + # CHECK-OBJ: addi zero, zero, 0x0 diff --git a/llvm/test/Transforms/InstCombine/compare-signs.ll b/llvm/test/Transforms/InstCombine/compare-signs.ll index 9703b47b44d0c..59ec9adb30b9e 100644 --- a/llvm/test/Transforms/InstCombine/compare-signs.ll +++ b/llvm/test/Transforms/InstCombine/compare-signs.ll @@ -152,6 +152,19 @@ define i1 @test4a(i32 %a) { ret i1 %c } +define i1 @test4a_commuted(i32 %a) { +; CHECK-LABEL: @test4a_commuted( +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[SIGNUM:%.*]], 1 +; CHECK-NEXT: ret i1 [[C]] +; + %l = ashr i32 %a, 31 
+ %na = sub i32 0, %a + %r = lshr i32 %na, 31 + %signum = or i32 %r, %l + %c = icmp slt i32 %signum, 1 + ret i1 %c +} + define <2 x i1> @test4a_vec(<2 x i32> %a) { ; CHECK-LABEL: @test4a_vec( ; CHECK-NEXT: [[C:%.*]] = icmp slt <2 x i32> [[A:%.*]], splat (i32 1) diff --git a/llvm/test/Transforms/InstCombine/onehot_merge.ll b/llvm/test/Transforms/InstCombine/onehot_merge.ll index 2e57597455c2c..d68de1f1f0190 100644 --- a/llvm/test/Transforms/InstCombine/onehot_merge.ll +++ b/llvm/test/Transforms/InstCombine/onehot_merge.ll @@ -1143,3 +1143,20 @@ define i1 @foo1_and_signbit_lshr_without_shifting_signbit_not_pwr2_logical(i32 % %or = select i1 %t2, i1 true, i1 %t4 ret i1 %or } + +define i1 @two_types_of_bittest(i8 %x, i8 %c) { +; CHECK-LABEL: @two_types_of_bittest( +; CHECK-NEXT: [[T0:%.*]] = shl nuw i8 1, [[C:%.*]] +; CHECK-NEXT: [[ICMP1:%.*]] = icmp slt i8 [[X:%.*]], 0 +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[T0]] +; CHECK-NEXT: [[ICMP2:%.*]] = icmp ne i8 [[AND]], 0 +; CHECK-NEXT: [[RET:%.*]] = and i1 [[ICMP1]], [[ICMP2]] +; CHECK-NEXT: ret i1 [[RET]] +; + %t0 = shl i8 1, %c + %icmp1 = icmp slt i8 %x, 0 + %and = and i8 %x, %t0 + %icmp2 = icmp ne i8 %and, 0 + %ret = and i1 %icmp1, %icmp2 + ret i1 %ret +} diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 4e600fe69f938..386974f3eabfe 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3941,11 +3941,8 @@ entry: define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) { ; CHECK-LABEL: @src_or_eq_0_and_xor( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[OR]], 0 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[XOR]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] ; entry: %or = or i32 %y, %x @@ -3960,11 +3957,8 
@@ entry: define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) { ; CHECK-LABEL: @src_or_eq_0_xor_and( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[OR]], 0 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y]], [[X]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[AND]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i32 [[AND]] ; entry: %or = or i32 %y, %x @@ -4442,11 +4436,8 @@ define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) { define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) { ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and( -; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]] -; CHECK-NEXT: ret i32 [[COND]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[AND]] ; %or = or i32 %x, %y %or0 = icmp eq i32 %or, 0 @@ -4841,3 +4832,16 @@ define i32 @replace_and_cond_multiuse2(i1 %cond1, i1 %cond2) { %mux = select i1 %cond1, i32 %sel, i32 1 ret i32 %mux } + +define i32 @src_simplify_2x_at_once_and(i32 %x, i32 %y) { +; CHECK-LABEL: @src_simplify_2x_at_once_and( +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %and = and i32 %x, %y + %and0 = icmp eq i32 %and, -1 + %sub = sub i32 %x, %y + %xor = xor i32 %x, %y + %cond = select i1 %and0, i32 %sub, i32 %xor + ret i32 %cond +} diff --git a/llvm/test/Transforms/InstCombine/xor-and-or.ll b/llvm/test/Transforms/InstCombine/xor-and-or.ll index 47275ce31070b..c380e2748f89b 100644 --- a/llvm/test/Transforms/InstCombine/xor-and-or.ll +++ b/llvm/test/Transforms/InstCombine/xor-and-or.ll @@ -25,6 +25,18 @@ define i1 @xor_logic_and_logic_or2(i1 %c, i1 %x, i1 %y) { ret i1 %r } +define i1 @xor_logic_and_logic_or2_commuted(i1 %c, 
i1 %x, i1 %y) { +; CHECK-LABEL: @xor_logic_and_logic_or2_commuted( +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[X:%.*]], true +; CHECK-NEXT: [[R:%.*]] = select i1 [[C:%.*]], i1 [[TMP1]], i1 [[Y:%.*]] +; CHECK-NEXT: ret i1 [[R]] +; + %o = select i1 %y, i1 true, i1 %c + %a = select i1 %c, i1 %x, i1 false + %r = xor i1 %o, %a + ret i1 %r +} + define i1 @xor_logic_and_logic_or3(i1 %c, i1 %x, i1 %y) { ; CHECK-LABEL: @xor_logic_and_logic_or3( ; CHECK-NEXT: [[TMP1:%.*]] = freeze i1 [[C:%.*]] diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll new file mode 100644 index 0000000000000..543c73137c1b6 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll @@ -0,0 +1,1129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s + +; f2i/f2ui and d2i/d2ui - double/float to i32 tests + +;############################################################### +;# Tests with Positive 1.5 # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2i | +;+-------------------------------------------------------------+ +define i32 @test_pos_1_5_f2i_rm() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2i.rm(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2i_rn() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2i.rn(float 1.5) + ret i32 %res +} + + +define i32 @test_pos_1_5_f2i_rp() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2i.rp(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2i_rz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 
@llvm.nvvm.f2i.rz(float 1.5) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2i_ftz | +;+-------------------------------------------------------------+ +define i32 @test_pos_1_5_f2i_rm_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm_ftz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2i_rn_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn_ftz() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2i_rp_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp_ftz() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2i_rz_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz_ftz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 1.5) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2i | +;+-------------------------------------------------------------+ +define i32 @test_pos_1_5_d2i_rm() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rm() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.d2i.rm(double 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_d2i_rn() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rn() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.d2i.rn(double 1.5) + ret i32 %res +} + + +define i32 @test_pos_1_5_d2i_rp() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rp() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.d2i.rp(double 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_d2i_rz() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.d2i.rz(double 1.5) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui | 
+;+-------------------------------------------------------------+ +define i32 @test_pos_1_5_f2ui_rm() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2ui.rm(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2ui_rn() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2ui.rn(float 1.5) + ret i32 %res +} + + +define i32 @test_pos_1_5_f2ui_rp() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2ui.rp(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2ui_rz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2ui.rz(float 1.5) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui_ftz | +;+-------------------------------------------------------------+ +define i32 @test_pos_1_5_f2ui_rm_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm_ftz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2ui_rn_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn_ftz() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2ui_rp_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp_ftz() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_f2ui_rz_ftz() { +; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz_ftz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 1.5) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2ui | +;+-------------------------------------------------------------+ +define i32 @test_pos_1_5_d2ui_rm() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rm() { +; 
CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.d2ui.rm(double 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_d2ui_rn() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rn() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.d2ui.rn(double 1.5) + ret i32 %res +} + + +define i32 @test_pos_1_5_d2ui_rp() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rp() { +; CHECK-NEXT: ret i32 2 +; + %res = call i32 @llvm.nvvm.d2ui.rp(double 1.5) + ret i32 %res +} + +define i32 @test_pos_1_5_d2ui_rz() { +; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rz() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.d2ui.rz(double 1.5) + ret i32 %res +} + +;############################################################### +;# Tests with Negative 1.5 # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2i | +;+-------------------------------------------------------------+ +define i32 @test_neg_1_5_f2i_rm() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm() { +; CHECK-NEXT: ret i32 -2 +; + %res = call i32 @llvm.nvvm.f2i.rm(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2i_rn() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn() { +; CHECK-NEXT: ret i32 -2 +; + %res = call i32 @llvm.nvvm.f2i.rn(float -1.5) + ret i32 %res +} + + +define i32 @test_neg_1_5_f2i_rp() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.f2i.rp(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2i_rz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.f2i.rz(float -1.5) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2i_ftz | +;+-------------------------------------------------------------+ +define i32 @test_neg_1_5_f2i_rm_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm_ftz() { +; CHECK-NEXT: ret i32 -2 +; + %res = call 
i32 @llvm.nvvm.f2i.rm.ftz(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2i_rn_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn_ftz() { +; CHECK-NEXT: ret i32 -2 +; + %res = call i32 @llvm.nvvm.f2i.rn.ftz(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2i_rp_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp_ftz() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.f2i.rp.ftz(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2i_rz_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz_ftz() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.f2i.rz.ftz(float -1.5) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2i | +;+-------------------------------------------------------------+ +define i32 @test_neg_1_5_d2i_rm() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rm() { +; CHECK-NEXT: ret i32 -2 +; + %res = call i32 @llvm.nvvm.d2i.rm(double -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_d2i_rn() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rn() { +; CHECK-NEXT: ret i32 -2 +; + %res = call i32 @llvm.nvvm.d2i.rn(double -1.5) + ret i32 %res +} + + +define i32 @test_neg_1_5_d2i_rp() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rp() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.d2i.rp(double -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_d2i_rz() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rz() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.d2i.rz(double -1.5) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui | +;+-------------------------------------------------------------+ +define i32 @test_neg_1_5_f2ui_rm() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rm(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2ui_rn() { +; 
CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rn(float -1.5) + ret i32 %res +} + + +define i32 @test_neg_1_5_f2ui_rp() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rp(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2ui_rz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rz(float -1.5) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui_ftz | +;+-------------------------------------------------------------+ +define i32 @test_neg_1_5_f2ui_rm_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2ui_rn_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2ui_rp_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_f2ui_rz_ftz() { +; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] 
+; + %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.5) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2ui | +;+-------------------------------------------------------------+ +define i32 @test_neg_1_5_d2ui_rm() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.d2ui.rm(double -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_d2ui_rn() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rn() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rn(double -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.d2ui.rn(double -1.5) + ret i32 %res +} + + +define i32 @test_neg_1_5_d2ui_rp() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rp() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rp(double -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.d2ui.rp(double -1.5) + ret i32 %res +} + +define i32 @test_neg_1_5_d2ui_rz() { +; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rz() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rz(double -1.500000e+00) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.d2ui.rz(double -1.5) + ret i32 %res +} + +;############################################################### +;# Tests with NaN # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2i | +;+-------------------------------------------------------------+ +define i32 @test_nan_f2i_rm() { +; CHECK-LABEL: define i32 @test_nan_f2i_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rm(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2i_rn() { +; CHECK-LABEL: define i32 @test_nan_f2i_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rn(float 0x7FFFFF0000000000) + ret i32 %res +} + + 
+define i32 @test_nan_f2i_rp() { +; CHECK-LABEL: define i32 @test_nan_f2i_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rp(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2i_rz() { +; CHECK-LABEL: define i32 @test_nan_f2i_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rz(float 0x7FFFFF0000000000) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2i_ftz | +;+-------------------------------------------------------------+ +define i32 @test_nan_f2i_rm_ftz() { +; CHECK-LABEL: define i32 @test_nan_f2i_rm_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2i_rn_ftz() { +; CHECK-LABEL: define i32 @test_nan_f2i_rn_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2i_rp_ftz() { +; CHECK-LABEL: define i32 @test_nan_f2i_rp_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2i_rz_ftz() { +; CHECK-LABEL: define i32 @test_nan_f2i_rz_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2i | +;+-------------------------------------------------------------+ +define i32 @test_nan_d2i_rm() { +; CHECK-LABEL: define i32 @test_nan_d2i_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rm(double 0xFFF8000000000000) + ret i32 %res +} + +define i32 @test_nan_d2i_rn() { +; CHECK-LABEL: define i32 @test_nan_d2i_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rn(double 0xFFF8000000000000) + ret i32 %res +} + + +define i32 @test_nan_d2i_rp() { +; CHECK-LABEL: define i32 @test_nan_d2i_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 
@llvm.nvvm.d2i.rp(double 0xFFF8000000000000) + ret i32 %res +} + +define i32 @test_nan_d2i_rz() { +; CHECK-LABEL: define i32 @test_nan_d2i_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rz(double 0xFFF8000000000000) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui | +;+-------------------------------------------------------------+ +define i32 @test_nan_f2ui_rm() { +; CHECK-LABEL: define i32 @test_nan_f2ui_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rm(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2ui_rn() { +; CHECK-LABEL: define i32 @test_nan_f2ui_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rn(float 0x7FFFFF0000000000) + ret i32 %res +} + + +define i32 @test_nan_f2ui_rp() { +; CHECK-LABEL: define i32 @test_nan_f2ui_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rp(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2ui_rz() { +; CHECK-LABEL: define i32 @test_nan_f2ui_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rz(float 0x7FFFFF0000000000) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui_ftz | +;+-------------------------------------------------------------+ +define i32 @test_nan_f2ui_rm_ftz() { +; CHECK-LABEL: define i32 @test_nan_f2ui_rm_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2ui_rn_ftz() { +; CHECK-LABEL: define i32 @test_nan_f2ui_rn_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2ui_rp_ftz() { +; CHECK-LABEL: define i32 @test_nan_f2ui_rp_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} + +define i32 @test_nan_f2ui_rz_ftz() { +; CHECK-LABEL: 
define i32 @test_nan_f2ui_rz_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x7FFFFF0000000000) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2ui | +;+-------------------------------------------------------------+ +define i32 @test_nan_d2ui_rm() { +; CHECK-LABEL: define i32 @test_nan_d2ui_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rm(double 0xFFF8000000000000) + ret i32 %res +} + +define i32 @test_nan_d2ui_rn() { +; CHECK-LABEL: define i32 @test_nan_d2ui_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rn(double 0xFFF8000000000000) + ret i32 %res +} + + +define i32 @test_nan_d2ui_rp() { +; CHECK-LABEL: define i32 @test_nan_d2ui_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rp(double 0xFFF8000000000000) + ret i32 %res +} + +define i32 @test_nan_d2ui_rz() { +; CHECK-LABEL: define i32 @test_nan_d2ui_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rz(double 0xFFF8000000000000) + ret i32 %res +} + +;############################################################### +;# Tests with Positive Subnormal # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2i | +;+-------------------------------------------------------------+ +define i32 @test_pos_subnormal_f2i_rm() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rm(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2i_rn() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rn(float 0x380FFFFFC0000000) + ret i32 %res +} + + +define i32 @test_pos_subnormal_f2i_rp() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2i.rp(float 
0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2i_rz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rz(float 0x380FFFFFC0000000) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2i_ftz | +;+-------------------------------------------------------------+ +define i32 @test_pos_subnormal_f2i_rm_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2i_rn_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2i_rp_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2i_rz_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2i | +;+-------------------------------------------------------------+ +define i32 @test_pos_subnormal_d2i_rm() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rm(double 0x000fffffffffffff) + ret i32 %res +} + +define i32 @test_pos_subnormal_d2i_rn() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rn(double 0x000fffffffffffff) + ret i32 %res +} + + +define i32 @test_pos_subnormal_d2i_rp() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rp() { +; CHECK-NEXT: ret i32 1 +; + %res = call 
i32 @llvm.nvvm.d2i.rp(double 0x000fffffffffffff) + ret i32 %res +} + +define i32 @test_pos_subnormal_d2i_rz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rz(double 0x000fffffffffffff) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui | +;+-------------------------------------------------------------+ +define i32 @test_pos_subnormal_f2ui_rm() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rm(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2ui_rn() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rn(float 0x380FFFFFC0000000) + ret i32 %res +} + + +define i32 @test_pos_subnormal_f2ui_rp() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.f2ui.rp(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2ui_rz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rz(float 0x380FFFFFC0000000) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui_ftz | +;+-------------------------------------------------------------+ +define i32 @test_pos_subnormal_f2ui_rm_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2ui_rn_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2ui_rp_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp_ftz() { +; 
CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} + +define i32 @test_pos_subnormal_f2ui_rz_ftz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x380FFFFFC0000000) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2ui | +;+-------------------------------------------------------------+ +define i32 @test_pos_subnormal_d2ui_rm() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rm() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rm(double 0x000fffffffffffff) + ret i32 %res +} + +define i32 @test_pos_subnormal_d2ui_rn() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rn(double 0x000fffffffffffff) + ret i32 %res +} + + +define i32 @test_pos_subnormal_d2ui_rp() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rp() { +; CHECK-NEXT: ret i32 1 +; + %res = call i32 @llvm.nvvm.d2ui.rp(double 0x000fffffffffffff) + ret i32 %res +} + +define i32 @test_pos_subnormal_d2ui_rz() { +; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rz(double 0x000fffffffffffff) + ret i32 %res +} + +;############################################################### +;# Tests with Negative Subnormal # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2i | +;+-------------------------------------------------------------+ +define i32 @test_neg_subnormal_f2i_rm() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.f2i.rm(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2i_rn() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn() { +; CHECK-NEXT: ret i32 0 +; + 
%res = call i32 @llvm.nvvm.f2i.rn(float 0xB80FFFFFC0000000) + ret i32 %res +} + + +define i32 @test_neg_subnormal_f2i_rp() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rp(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2i_rz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rz(float 0xB80FFFFFC0000000) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2i_ftz | +;+-------------------------------------------------------------+ +define i32 @test_neg_subnormal_f2i_rm_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2i_rn_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2i_rp_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2i_rz_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2i | +;+-------------------------------------------------------------+ +define i32 @test_neg_subnormal_d2i_rm() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rm() { +; CHECK-NEXT: ret i32 -1 +; + %res = call i32 @llvm.nvvm.d2i.rm(double 0x800fffffffffffff) + ret i32 %res +} + +define i32 @test_neg_subnormal_d2i_rn() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rn() { +; 
CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rn(double 0x800fffffffffffff) + ret i32 %res +} + + +define i32 @test_neg_subnormal_d2i_rp() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rp(double 0x800fffffffffffff) + ret i32 %res +} + +define i32 @test_neg_subnormal_d2i_rz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2i.rz(double 0x800fffffffffffff) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui | +;+-------------------------------------------------------------+ +define i32 @test_neg_subnormal_f2ui_rm() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2ui_rn() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rn(float 0xB80FFFFFC0000000) + ret i32 %res +} + + +define i32 @test_neg_subnormal_f2ui_rp() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rp(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2ui_rz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rz(float 0xB80FFFFFC0000000) + ret i32 %res +} + +;+-------------------------------------------------------------+ +;| f2ui_ftz | +;+-------------------------------------------------------------+ +define i32 @test_neg_subnormal_f2ui_rm_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} + 
+define i32 @test_neg_subnormal_f2ui_rn_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2ui_rp_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} + +define i32 @test_neg_subnormal_f2ui_rz_ftz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz_ftz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0xB80FFFFFC0000000) + ret i32 %res +} +;+-------------------------------------------------------------+ +;| d2ui | +;+-------------------------------------------------------------+ +define i32 @test_neg_subnormal_d2ui_rm() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double 0x800FFFFFFFFFFFFF) +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.nvvm.d2ui.rm(double 0x800fffffffffffff) + ret i32 %res +} + +define i32 @test_neg_subnormal_d2ui_rn() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rn() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rn(double 0x800fffffffffffff) + ret i32 %res +} + + +define i32 @test_neg_subnormal_d2ui_rp() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rp() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rp(double 0x800fffffffffffff) + ret i32 %res +} + +define i32 @test_neg_subnormal_d2ui_rz() { +; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rz() { +; CHECK-NEXT: ret i32 0 +; + %res = call i32 @llvm.nvvm.d2ui.rz(double 0x800fffffffffffff) + ret i32 %res +} + +declare i32 @llvm.nvvm.f2i.rm(float) +declare i32 @llvm.nvvm.f2i.rn(float) +declare i32 @llvm.nvvm.f2i.rp(float) +declare i32 @llvm.nvvm.f2i.rz(float) + +declare i32 @llvm.nvvm.f2i.rm.ftz(float) +declare i32 
@llvm.nvvm.f2i.rn.ftz(float) +declare i32 @llvm.nvvm.f2i.rp.ftz(float) +declare i32 @llvm.nvvm.f2i.rz.ftz(float) + +declare i32 @llvm.nvvm.d2i.rm(double) +declare i32 @llvm.nvvm.d2i.rn(double) +declare i32 @llvm.nvvm.d2i.rp(double) +declare i32 @llvm.nvvm.d2i.rz(double) + + +declare i32 @llvm.nvvm.f2ui.rm(float) +declare i32 @llvm.nvvm.f2ui.rn(float) +declare i32 @llvm.nvvm.f2ui.rp(float) +declare i32 @llvm.nvvm.f2ui.rz(float) + +declare i32 @llvm.nvvm.f2ui.rm.ftz(float) +declare i32 @llvm.nvvm.f2ui.rn.ftz(float) +declare i32 @llvm.nvvm.f2ui.rp.ftz(float) +declare i32 @llvm.nvvm.f2ui.rz.ftz(float) + +declare i32 @llvm.nvvm.d2ui.rm(double) +declare i32 @llvm.nvvm.d2ui.rn(double) +declare i32 @llvm.nvvm.d2ui.rp(double) +declare i32 @llvm.nvvm.d2ui.rz(double) diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll new file mode 100644 index 0000000000000..be38177dce2c3 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll @@ -0,0 +1,1129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s + +; f2ll/f2ull and d2ll/d2ull - double/float to i64 tests + +;############################################################### +;# Tests with Positive 1.5 # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2ll | +;+-------------------------------------------------------------+ +define i64 @test_pos_1_5_f2ll_rm() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ll.rm(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ll_rn() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ll.rn(float 1.5) + ret i64 %res +} + + +define i64 
@test_pos_1_5_f2ll_rp() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ll.rp(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ll_rz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ll.rz(float 1.5) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ll_ftz | +;+-------------------------------------------------------------+ +define i64 @test_pos_1_5_f2ll_rm_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm_ftz() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ll_rn_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn_ftz() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ll_rp_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp_ftz() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ll_rz_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz_ftz() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 1.5) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ll | +;+-------------------------------------------------------------+ +define i64 @test_pos_1_5_d2ll_rm() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rm() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.d2ll.rm(double 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_d2ll_rn() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rn() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.d2ll.rn(double 1.5) + ret i64 %res +} + + +define i64 @test_pos_1_5_d2ll_rp() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rp() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.d2ll.rp(double 1.5) + ret 
i64 %res +} + +define i64 @test_pos_1_5_d2ll_rz() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rz() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.d2ll.rz(double 1.5) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull | +;+-------------------------------------------------------------+ +define i64 @test_pos_1_5_f2ull_rm() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ull.rm(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ull_rn() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ull.rn(float 1.5) + ret i64 %res +} + + +define i64 @test_pos_1_5_f2ull_rp() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ull.rp(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ull_rz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ull.rz(float 1.5) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull_ftz | +;+-------------------------------------------------------------+ +define i64 @test_pos_1_5_f2ull_rm_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm_ftz() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ull_rn_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn_ftz() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ull_rp_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp_ftz() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_f2ull_rz_ftz() { +; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz_ftz() { +; CHECK-NEXT: ret i64 1 +; + %res = 
call i64 @llvm.nvvm.f2ull.rz.ftz(float 1.5) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ull | +;+-------------------------------------------------------------+ +define i64 @test_pos_1_5_d2ull_rm() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rm() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.d2ull.rm(double 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_d2ull_rn() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rn() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.d2ull.rn(double 1.5) + ret i64 %res +} + + +define i64 @test_pos_1_5_d2ull_rp() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rp() { +; CHECK-NEXT: ret i64 2 +; + %res = call i64 @llvm.nvvm.d2ull.rp(double 1.5) + ret i64 %res +} + +define i64 @test_pos_1_5_d2ull_rz() { +; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rz() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.d2ull.rz(double 1.5) + ret i64 %res +} + +;############################################################### +;# Tests with Negative 1.5 # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2ll | +;+-------------------------------------------------------------+ +define i64 @test_neg_1_5_f2ll_rm() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm() { +; CHECK-NEXT: ret i64 -2 +; + %res = call i64 @llvm.nvvm.f2ll.rm(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ll_rn() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn() { +; CHECK-NEXT: ret i64 -2 +; + %res = call i64 @llvm.nvvm.f2ll.rn(float -1.5) + ret i64 %res +} + + +define i64 @test_neg_1_5_f2ll_rp() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 @llvm.nvvm.f2ll.rp(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ll_rz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 
@llvm.nvvm.f2ll.rz(float -1.5) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ll_ftz | +;+-------------------------------------------------------------+ +define i64 @test_neg_1_5_f2ll_rm_ftz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm_ftz() { +; CHECK-NEXT: ret i64 -2 +; + %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ll_rn_ftz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn_ftz() { +; CHECK-NEXT: ret i64 -2 +; + %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ll_rp_ftz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp_ftz() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ll_rz_ftz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz_ftz() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float -1.5) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ll | +;+-------------------------------------------------------------+ +define i64 @test_neg_1_5_d2ll_rm() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rm() { +; CHECK-NEXT: ret i64 -2 +; + %res = call i64 @llvm.nvvm.d2ll.rm(double -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_d2ll_rn() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rn() { +; CHECK-NEXT: ret i64 -2 +; + %res = call i64 @llvm.nvvm.d2ll.rn(double -1.5) + ret i64 %res +} + + +define i64 @test_neg_1_5_d2ll_rp() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rp() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 @llvm.nvvm.d2ll.rp(double -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_d2ll_rz() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rz() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 @llvm.nvvm.d2ll.rz(double -1.5) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull | 
+;+-------------------------------------------------------------+ +define i64 @test_neg_1_5_f2ull_rm() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rm(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ull_rn() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rn(float -1.5) + ret i64 %res +} + + +define i64 @test_neg_1_5_f2ull_rp() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rp(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ull_rz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rz(float -1.5) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull_ftz | +;+-------------------------------------------------------------+ +define i64 @test_neg_1_5_f2ull_rm_ftz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ull_rn_ftz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ull_rp_ftz() { +; CHECK-LABEL: define i64 
@test_neg_1_5_f2ull_rp_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_f2ull_rz_ftz() { +; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz_ftz() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.5) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ull | +;+-------------------------------------------------------------+ +define i64 @test_neg_1_5_d2ull_rm() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.d2ull.rm(double -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_d2ull_rn() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rn() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rn(double -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.d2ull.rn(double -1.5) + ret i64 %res +} + + +define i64 @test_neg_1_5_d2ull_rp() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rp() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rp(double -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.d2ull.rp(double -1.5) + ret i64 %res +} + +define i64 @test_neg_1_5_d2ull_rz() { +; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rz() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rz(double -1.500000e+00) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.d2ull.rz(double -1.5) + ret i64 %res +} + +;############################################################### +;# Tests with NaN # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2ll 
| +;+-------------------------------------------------------------+ +define i64 @test_nan_f2ll_rm() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rm(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ll_rn() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rn(float 0x7FFFFF0000000000) + ret i64 %res +} + + +define i64 @test_nan_f2ll_rp() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rp(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ll_rz() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rz(float 0x7FFFFF0000000000) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ll_ftz | +;+-------------------------------------------------------------+ +define i64 @test_nan_f2ll_rm_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rm_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ll_rn_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rn_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ll_rp_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rp_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ll_rz_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ll_rz_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x7FFFFF0000000000) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ll | +;+-------------------------------------------------------------+ +define i64 @test_nan_d2ll_rm() { +; 
CHECK-LABEL: define i64 @test_nan_d2ll_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rm(double 0xFFF8000000000000) + ret i64 %res +} + +define i64 @test_nan_d2ll_rn() { +; CHECK-LABEL: define i64 @test_nan_d2ll_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rn(double 0xFFF8000000000000) + ret i64 %res +} + + +define i64 @test_nan_d2ll_rp() { +; CHECK-LABEL: define i64 @test_nan_d2ll_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rp(double 0xFFF8000000000000) + ret i64 %res +} + +define i64 @test_nan_d2ll_rz() { +; CHECK-LABEL: define i64 @test_nan_d2ll_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rz(double 0xFFF8000000000000) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull | +;+-------------------------------------------------------------+ +define i64 @test_nan_f2ull_rm() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rm(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ull_rn() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rn(float 0x7FFFFF0000000000) + ret i64 %res +} + + +define i64 @test_nan_f2ull_rp() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rp(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ull_rz() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rz(float 0x7FFFFF0000000000) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull_ftz | +;+-------------------------------------------------------------+ +define i64 @test_nan_f2ull_rm_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rm_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 
0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ull_rn_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rn_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ull_rp_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rp_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x7FFFFF0000000000) + ret i64 %res +} + +define i64 @test_nan_f2ull_rz_ftz() { +; CHECK-LABEL: define i64 @test_nan_f2ull_rz_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x7FFFFF0000000000) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ull | +;+-------------------------------------------------------------+ +define i64 @test_nan_d2ull_rm() { +; CHECK-LABEL: define i64 @test_nan_d2ull_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rm(double 0xFFF8000000000000) + ret i64 %res +} + +define i64 @test_nan_d2ull_rn() { +; CHECK-LABEL: define i64 @test_nan_d2ull_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rn(double 0xFFF8000000000000) + ret i64 %res +} + + +define i64 @test_nan_d2ull_rp() { +; CHECK-LABEL: define i64 @test_nan_d2ull_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rp(double 0xFFF8000000000000) + ret i64 %res +} + +define i64 @test_nan_d2ull_rz() { +; CHECK-LABEL: define i64 @test_nan_d2ull_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rz(double 0xFFF8000000000000) + ret i64 %res +} + +;############################################################### +;# Tests with Positive Subnormal # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2ll | +;+-------------------------------------------------------------+ +define i64 @test_pos_subnormal_f2ll_rm() { +; CHECK-LABEL: define i64 
@test_pos_subnormal_f2ll_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rm(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ll_rn() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rn(float 0x380FFFFFC0000000) + ret i64 %res +} + + +define i64 @test_pos_subnormal_f2ll_rp() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ll.rp(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ll_rz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rz(float 0x380FFFFFC0000000) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ll_ftz | +;+-------------------------------------------------------------+ +define i64 @test_pos_subnormal_f2ll_rm_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ll_rn_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ll_rp_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ll_rz_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ll | +;+-------------------------------------------------------------+ +define i64 
@test_pos_subnormal_d2ll_rm() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rm(double 0x000fffffffffffff) + ret i64 %res +} + +define i64 @test_pos_subnormal_d2ll_rn() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rn(double 0x000fffffffffffff) + ret i64 %res +} + + +define i64 @test_pos_subnormal_d2ll_rp() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rp() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.d2ll.rp(double 0x000fffffffffffff) + ret i64 %res +} + +define i64 @test_pos_subnormal_d2ll_rz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rz(double 0x000fffffffffffff) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull | +;+-------------------------------------------------------------+ +define i64 @test_pos_subnormal_f2ull_rm() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rm(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ull_rn() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rn(float 0x380FFFFFC0000000) + ret i64 %res +} + + +define i64 @test_pos_subnormal_f2ull_rp() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.f2ull.rp(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ull_rz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rz(float 0x380FFFFFC0000000) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull_ftz | 
+;+-------------------------------------------------------------+ +define i64 @test_pos_subnormal_f2ull_rm_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ull_rn_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ull_rp_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} + +define i64 @test_pos_subnormal_f2ull_rz_ftz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x380FFFFFC0000000) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ull | +;+-------------------------------------------------------------+ +define i64 @test_pos_subnormal_d2ull_rm() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rm() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rm(double 0x000fffffffffffff) + ret i64 %res +} + +define i64 @test_pos_subnormal_d2ull_rn() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rn(double 0x000fffffffffffff) + ret i64 %res +} + + +define i64 @test_pos_subnormal_d2ull_rp() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rp() { +; CHECK-NEXT: ret i64 1 +; + %res = call i64 @llvm.nvvm.d2ull.rp(double 0x000fffffffffffff) + ret i64 %res +} + +define i64 @test_pos_subnormal_d2ull_rz() { +; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rz(double 0x000fffffffffffff) + ret i64 %res +} + 
+;############################################################### +;# Tests with Negative Subnormal # +;############################################################### + +;+-------------------------------------------------------------+ +;| f2ll | +;+-------------------------------------------------------------+ +define i64 @test_neg_subnormal_f2ll_rm() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 @llvm.nvvm.f2ll.rm(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ll_rn() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rn(float 0xB80FFFFFC0000000) + ret i64 %res +} + + +define i64 @test_neg_subnormal_f2ll_rp() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rp(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ll_rz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rz(float 0xB80FFFFFC0000000) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ll_ftz | +;+-------------------------------------------------------------+ +define i64 @test_neg_subnormal_f2ll_rm_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ll_rn_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ll_rp_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} + 
+define i64 @test_neg_subnormal_f2ll_rz_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ll | +;+-------------------------------------------------------------+ +define i64 @test_neg_subnormal_d2ll_rm() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rm() { +; CHECK-NEXT: ret i64 -1 +; + %res = call i64 @llvm.nvvm.d2ll.rm(double 0x800fffffffffffff) + ret i64 %res +} + +define i64 @test_neg_subnormal_d2ll_rn() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rn(double 0x800fffffffffffff) + ret i64 %res +} + + +define i64 @test_neg_subnormal_d2ll_rp() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rp(double 0x800fffffffffffff) + ret i64 %res +} + +define i64 @test_neg_subnormal_d2ll_rz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ll.rz(double 0x800fffffffffffff) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull | +;+-------------------------------------------------------------+ +define i64 @test_neg_subnormal_f2ull_rm() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ull_rn() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rn(float 0xB80FFFFFC0000000) + ret i64 %res +} + + +define i64 @test_neg_subnormal_f2ull_rp() { +; CHECK-LABEL: define i64 
@test_neg_subnormal_f2ull_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rp(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ull_rz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rz(float 0xB80FFFFFC0000000) + ret i64 %res +} + +;+-------------------------------------------------------------+ +;| f2ull_ftz | +;+-------------------------------------------------------------+ +define i64 @test_neg_subnormal_f2ull_rm_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ull_rn_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ull_rp_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} + +define i64 @test_neg_subnormal_f2ull_rz_ftz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz_ftz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0xB80FFFFFC0000000) + ret i64 %res +} +;+-------------------------------------------------------------+ +;| d2ull | +;+-------------------------------------------------------------+ +define i64 @test_neg_subnormal_d2ull_rm() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rm() { +; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double 0x800FFFFFFFFFFFFF) +; CHECK-NEXT: ret i64 [[RES]] +; + %res = call i64 @llvm.nvvm.d2ull.rm(double 0x800fffffffffffff) + ret i64 %res +} + +define i64 @test_neg_subnormal_d2ull_rn() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rn() { +; CHECK-NEXT: 
ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rn(double 0x800fffffffffffff) + ret i64 %res +} + + +define i64 @test_neg_subnormal_d2ull_rp() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rp() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rp(double 0x800fffffffffffff) + ret i64 %res +} + +define i64 @test_neg_subnormal_d2ull_rz() { +; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rz() { +; CHECK-NEXT: ret i64 0 +; + %res = call i64 @llvm.nvvm.d2ull.rz(double 0x800fffffffffffff) + ret i64 %res +} + +declare i64 @llvm.nvvm.f2ll.rm(float) +declare i64 @llvm.nvvm.f2ll.rn(float) +declare i64 @llvm.nvvm.f2ll.rp(float) +declare i64 @llvm.nvvm.f2ll.rz(float) + +declare i64 @llvm.nvvm.f2ll.rm.ftz(float) +declare i64 @llvm.nvvm.f2ll.rn.ftz(float) +declare i64 @llvm.nvvm.f2ll.rp.ftz(float) +declare i64 @llvm.nvvm.f2ll.rz.ftz(float) + +declare i64 @llvm.nvvm.d2ll.rm(double) +declare i64 @llvm.nvvm.d2ll.rn(double) +declare i64 @llvm.nvvm.d2ll.rp(double) +declare i64 @llvm.nvvm.d2ll.rz(double) + + +declare i64 @llvm.nvvm.f2ull.rm(float) +declare i64 @llvm.nvvm.f2ull.rn(float) +declare i64 @llvm.nvvm.f2ull.rp(float) +declare i64 @llvm.nvvm.f2ull.rz(float) + +declare i64 @llvm.nvvm.f2ull.rm.ftz(float) +declare i64 @llvm.nvvm.f2ull.rn.ftz(float) +declare i64 @llvm.nvvm.f2ull.rp.ftz(float) +declare i64 @llvm.nvvm.f2ull.rz.ftz(float) + +declare i64 @llvm.nvvm.d2ull.rm(double) +declare i64 @llvm.nvvm.d2ull.rn(double) +declare i64 @llvm.nvvm.d2ull.rp(double) +declare i64 @llvm.nvvm.d2ull.rz(double) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index a330b6964a660..f323231445aad 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -37,16 +37,16 @@ 
define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]] ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]] -; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]] -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]] ; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll index 66bb9357750c8..3d23090dd1235 100644 --- 
a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -4,8 +4,6 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; FIXME: GEP flags on GEPs for reverse vector pointer need to be dropped when folding the tail. - define i1 @fn(ptr %nno) #0 { ; CHECK-LABEL: define i1 @fn( ; CHECK-SAME: ptr [[NNO:%.*]]) #[[ATTR0:[0-9]+]] { @@ -26,8 +24,8 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 -3 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison) ; CHECK-NEXT: [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/tools/llvm-profgen/context-depth.test b/llvm/test/tools/llvm-profgen/context-depth.test new file mode 100644 index 0000000000000..4eaa5fa1eae9d --- /dev/null +++ b/llvm/test/tools/llvm-profgen/context-depth.test @@ -0,0 +1,125 @@ +; Test --csprof-max-context-depth and --csprof-max-unsymbolized-context-depth + +; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t 
--compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0 +; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH +; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0 --skip-symbolization +; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH +; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0 +; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH-PROF +; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0 +; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH + + +; CHECK-MAX-CTX-DEPTH: [fb]:19:6 +; CHECK-MAX-CTX-DEPTH: 1: 6 +; CHECK-MAX-CTX-DEPTH: 2: 3 +; CHECK-MAX-CTX-DEPTH: 3: 3 +; CHECK-MAX-CTX-DEPTH: 4: 0 +; CHECK-MAX-CTX-DEPTH: 5: 4 fb:4 +; CHECK-MAX-CTX-DEPTH: 6: 3 fa:3 +; CHECK-MAX-CTX-DEPTH: !CFGChecksum: 563022570642068 +; CHECK-MAX-CTX-DEPTH: [fa]:14:4 +; CHECK-MAX-CTX-DEPTH: 1: 4 +; CHECK-MAX-CTX-DEPTH: 3: 4 +; CHECK-MAX-CTX-DEPTH: 4: 2 +; CHECK-MAX-CTX-DEPTH: 5: 1 +; CHECK-MAX-CTX-DEPTH: 6: 0 +; CHECK-MAX-CTX-DEPTH: 7: 2 fb:2 +; CHECK-MAX-CTX-DEPTH: 8: 1 fa:1 +; CHECK-MAX-CTX-DEPTH: !CFGChecksum: 
563070469352221 + + +; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7ab] +; CHECK-MAX-UNSYM-CTX-DEPTH: 3 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7a0-7a7:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7a0-7ab:3 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7b2-7b5:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 3 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7a7->7b2:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7ab->7a0:4 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7b5->7c0:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7b5] +; CHECK-MAX-UNSYM-CTX-DEPTH: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7c0-7d4:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7d4->7c0:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7d4] +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7c0-7cd:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7db-7e0:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7cd->7db:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7e0->7a0:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7e0] +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7a0-7a7:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7b2-7b5:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7a7->7b2:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7b5->7c0:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7d4 @ 0x7e0] +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7a0-7a7:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7b2-7b5:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7a7->7b2:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7b5->7c0:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7e0 @ 0x7b5] +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7c0-7cd:2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7db-7e0:1 +; CHECK-MAX-UNSYM-CTX-DEPTH: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7cd->7db:2 +; CHECK-MAX-UNSYM-CTX-DEPTH: 7e0->7a0:1 + +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:5 @ fb]:13:4 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 1: 4 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 2: 3 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 3: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 4: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 5: 4 fb:4 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 6: 1 fa:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 
!CFGChecksum: 563022570642068 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:7 @ fb:6 @ fa]:6:2 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 1: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 3: 2 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 4: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 5: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 6: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 7: 1 fb:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 8: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: !CFGChecksum: 563070469352221 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:6 @ fa]:4:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 1: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 3: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 4: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 5: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 6: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 7: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 8: 1 fa:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: !CFGChecksum: 563070469352221 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:8 @ fa]:4:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 1: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 3: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 4: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 5: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 6: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 7: 1 fb:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 8: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: !CFGChecksum: 563070469352221 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:8 @ fa:7 @ fb]:3:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 1: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 2: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 3: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 4: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 5: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 6: 1 fa:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: !CFGChecksum: 563022570642068 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:7 @ fb]:3:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 1: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 2: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 3: 1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 4: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 5: 0 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: 6: 1 fa:1 +; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: !CFGChecksum: 563022570642068 diff 
--git a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test index c673028584c0d..b8e3e248e7793 100644 --- a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test +++ b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test @@ -9,9 +9,6 @@ ; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe-nommap.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --profile-summary-hot-count=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0 ; RUN: FileCheck %s --input-file %t -; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0 -; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH - ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa]:4:1 ; CHECK-UNCOMPRESS: 1: 1 @@ -68,23 +65,6 @@ ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:1:0 ; CHECK-UNCOMPRESS: 5: 1 fb:1 ; CHECK-UNCOMPRESS: !CFGChecksum: 563022570642068 -; CHECK-MAX-CTX-DEPTH: [fb]:19:6 -; CHECK-MAX-CTX-DEPTH: 1: 6 -; CHECK-MAX-CTX-DEPTH: 2: 3 -; CHECK-MAX-CTX-DEPTH: 3: 3 -; CHECK-MAX-CTX-DEPTH: 4: 0 -; CHECK-MAX-CTX-DEPTH: 5: 4 fb:4 -; CHECK-MAX-CTX-DEPTH: 6: 3 fa:3 -; CHECK-MAX-CTX-DEPTH: !CFGChecksum: 563022570642068 -; CHECK-MAX-CTX-DEPTH: [fa]:14:4 -; CHECK-MAX-CTX-DEPTH: 1: 4 -; CHECK-MAX-CTX-DEPTH: 3: 4 -; CHECK-MAX-CTX-DEPTH: 4: 2 -; CHECK-MAX-CTX-DEPTH: 5: 1 -; CHECK-MAX-CTX-DEPTH: 6: 0 -; CHECK-MAX-CTX-DEPTH: 7: 2 fb:2 -; CHECK-MAX-CTX-DEPTH: 8: 1 fa:1 -; CHECK-MAX-CTX-DEPTH: !CFGChecksum: 
563070469352221 ; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4 diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp index 41d361532908c..5636782bdf7f6 100644 --- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp @@ -24,6 +24,8 @@ namespace llvm { namespace exegesis { +#include "RISCVGenExegesis.inc" + namespace { // Stores constant value to a general-purpose (integer) register. @@ -132,8 +134,7 @@ class ExegesisRISCVTarget : public ExegesisTarget { }; ExegesisRISCVTarget::ExegesisRISCVTarget() - : ExegesisTarget(ArrayRef{}, - RISCV_MC::isOpcodeAvailable) {} + : ExegesisTarget(RISCVCpuPfmCounters, RISCV_MC::isOpcodeAvailable) {} bool ExegesisRISCVTarget::matchesArch(Triple::ArchType Arch) const { return Arch == Triple::riscv32 || Arch == Triple::riscv64; diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp index 111c546f5329f..ad113eda27914 100644 --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -42,6 +42,11 @@ static cl::opt cl::opt ShowDetailedWarning("show-detailed-warning", cl::desc("Show detailed warning message.")); +static cl::opt CSProfMaxUnsymbolizedCtxDepth( + "csprof-max-unsymbolized-context-depth", cl::init(-1), + cl::desc("Keep the last K contexts while merging unsymbolized profile. -1 " + "means no depth limit.")); + extern cl::opt PerfTraceFilename; extern cl::opt ShowDisassemblyOnly; extern cl::opt ShowSourceLocations; @@ -172,7 +177,19 @@ std::shared_ptr AddressStack::getContextKey() { std::shared_ptr KeyStr = std::make_shared(); KeyStr->Context = Stack; CSProfileGenerator::compressRecursionContext(KeyStr->Context); - CSProfileGenerator::trimContext(KeyStr->Context); + // MaxContextDepth(--csprof-max-context-depth) is used to trim both symbolized + // and unsymbolized profile context. 
Sometimes we want to at least preserve + // the inlinings for the leaf frame(the profiled binary inlining), + // --csprof-max-context-depth may not be flexible enough, in this case, + // --csprof-max-unsymbolized-context-depth is used to limit the context for + // unsymbolized profile. If both are set, use the minimum of them. + int Depth = CSProfileGenerator::MaxContextDepth != -1 + ? CSProfileGenerator::MaxContextDepth + : KeyStr->Context.size(); + Depth = CSProfMaxUnsymbolizedCtxDepth != -1 + ? std::min(static_cast(CSProfMaxUnsymbolizedCtxDepth), Depth) + : Depth; + CSProfileGenerator::trimContext(KeyStr->Context, Depth); return KeyStr; } diff --git a/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt b/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt index 3ee3a0dc6b5d0..735f17ab03e61 100644 --- a/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt +++ b/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt @@ -53,6 +53,9 @@ endif() if(LLVM_TARGETS_TO_BUILD MATCHES "Mips") include(Mips/CMakeLists.txt) endif() +if(LLVM_TARGETS_TO_BUILD MATCHES "RISCV") + include(RISCV/CMakeLists.txt) +endif() include_directories(${exegesis_includes}) diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt b/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt new file mode 100644 index 0000000000000..1984819be7738 --- /dev/null +++ b/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt @@ -0,0 +1,21 @@ +add_llvm_exegesis_unittest_includes( + ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV + ${LLVM_BINARY_DIR}/lib/Target/RISCV + ${LLVM_MAIN_SRC_DIR}/tools/llvm-exegesis/lib + ) + +add_llvm_exegesis_unittest_link_components( + MC + MCParser + Object + Support + Symbolize + RISCV + ) + +add_llvm_exegesis_unittest_sources( + SnippetGeneratorTest.cpp + TargetTest.cpp + ) +add_llvm_exegesis_unittest_link_libraries( + LLVMExegesisRISCV) diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp b/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp 
new file mode 100644 index 0000000000000..5920b79da9d3e --- /dev/null +++ b/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp @@ -0,0 +1,122 @@ +//===-- SnippetGeneratorTest.cpp --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../Common/AssemblerUtils.h" +#include "LlvmState.h" +#include "MCInstrDescView.h" +#include "ParallelSnippetGenerator.h" +#include "RISCVInstrInfo.h" +#include "RegisterAliasing.h" +#include "SerialSnippetGenerator.h" +#include "TestBase.h" + +namespace llvm { +namespace exegesis { +namespace { + +using testing::AnyOf; +using testing::ElementsAre; +using testing::HasSubstr; +using testing::SizeIs; + +MATCHER(IsInvalid, "") { return !arg.isValid(); } +MATCHER(IsReg, "") { return arg.isReg(); } + +template +class RISCVSnippetGeneratorTest : public RISCVTestBase { +protected: + RISCVSnippetGeneratorTest() : Generator(State, SnippetGenerator::Options()) {} + + std::vector checkAndGetCodeTemplates(unsigned Opcode) { + randomGenerator().seed(0); // Initialize seed. + const Instruction &Instr = State.getIC().getInstr(Opcode); + auto CodeTemplateOrError = Generator.generateCodeTemplates( + &Instr, State.getRATC().emptyRegisters()); + EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration. 
+ return std::move(CodeTemplateOrError.get()); + } + + SnippetGeneratorT Generator; +}; + +using RISCVSerialSnippetGeneratorTest = + RISCVSnippetGeneratorTest; + +using RISCVParallelSnippetGeneratorTest = + RISCVSnippetGeneratorTest; + +TEST_F(RISCVSerialSnippetGeneratorTest, + ImplicitSelfDependencyThroughExplicitRegs) { + // - ADD + // - Op0 Explicit Def RegClass(GPR) + // - Op1 Explicit Use RegClass(GPR) + // - Op2 Explicit Use RegClass(GPR) + // - Var0 [Op0] + // - Var1 [Op1] + // - Var2 [Op2] + // - hasAliasingRegisters + const unsigned Opcode = RISCV::ADD; + const auto CodeTemplates = checkAndGetCodeTemplates(Opcode); + ASSERT_THAT(CodeTemplates, SizeIs(1)); + const auto &CT = CodeTemplates[0]; + EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_EXPLICIT_REGS); + ASSERT_THAT(CT.Instructions, SizeIs(1)); + const InstructionTemplate &IT = CT.Instructions[0]; + EXPECT_THAT(IT.getOpcode(), Opcode); + ASSERT_THAT(IT.getVariableValues(), SizeIs(3)); + EXPECT_THAT(IT.getVariableValues(), + AnyOf(ElementsAre(IsReg(), IsInvalid(), IsReg()), + ElementsAre(IsReg(), IsReg(), IsInvalid()))) + << "Op0 is either set to Op1 or to Op2"; +} + +TEST_F(RISCVSerialSnippetGeneratorTest, + ImplicitSelfDependencyThroughExplicitRegsForbidAll) { + // - XOR + // - Op0 Explicit Def RegClass(GPR) + // - Op1 Explicit Use RegClass(GPR) + // - Op2 Explicit Use RegClass(GPR) + // - Var0 [Op0] + // - Var1 [Op1] + // - Var2 [Op2] + // - hasAliasingRegisters + randomGenerator().seed(0); // Initialize seed. + const Instruction &Instr = State.getIC().getInstr(RISCV::XOR); + auto AllRegisters = State.getRATC().emptyRegisters(); + AllRegisters.flip(); + EXPECT_TRUE(errorToBool( + Generator.generateCodeTemplates(&Instr, AllRegisters).takeError())); +} + +TEST_F(RISCVParallelSnippetGeneratorTest, MemoryUse) { + // LB reads from memory. 
+ // - LB + // - Op0 Explicit Def RegClass(GPR) + // - Op1 Explicit Use Memory RegClass(GPR) + // - Op2 Explicit Use Memory + // - Var0 [Op0] + // - Var1 [Op1] + // - Var2 [Op2] + // - hasMemoryOperands + const unsigned Opcode = RISCV::LB; + const auto CodeTemplates = checkAndGetCodeTemplates(Opcode); + ASSERT_THAT(CodeTemplates, SizeIs(1)); + const auto &CT = CodeTemplates[0]; + EXPECT_THAT(CT.Info, HasSubstr("instruction has no tied variables")); + EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN); + ASSERT_THAT(CT.Instructions, + SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses)); + const InstructionTemplate &IT = CT.Instructions[0]; + EXPECT_THAT(IT.getOpcode(), Opcode); + ASSERT_THAT(IT.getVariableValues(), SizeIs(3)); + EXPECT_EQ(IT.getVariableValues()[1].getReg(), RISCV::X10); +} + +} // namespace +} // namespace exegesis +} // namespace llvm diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp new file mode 100644 index 0000000000000..12d3ce7165a86 --- /dev/null +++ b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp @@ -0,0 +1,56 @@ +//===-- TargetTest.cpp ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Target.h" + +#include +#include + +#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "TestBase.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace exegesis { + +void InitializeRISCVExegesisTarget(); + +namespace { + +using testing::IsEmpty; +using testing::Not; +using testing::NotNull; + +class RISCVTargetTest : public RISCVTestBase { +protected: + std::vector setRegTo(unsigned Reg, const APInt &Value) { + return State.getExegesisTarget().setRegTo(State.getSubtargetInfo(), Reg, + Value); + } +}; + +TEST_F(RISCVTargetTest, SetRegToConstant) { + const auto Insts = setRegTo(RISCV::X10, APInt()); + EXPECT_THAT(Insts, Not(IsEmpty())); +} + +TEST_F(RISCVTargetTest, DefaultPfmCounters) { + const std::string Expected = "CYCLES"; + EXPECT_EQ(State.getExegesisTarget().getPfmCounters("").CycleCounter, + Expected); + EXPECT_EQ( + State.getExegesisTarget().getPfmCounters("unknown_cpu").CycleCounter, + Expected); +} + +} // namespace +} // namespace exegesis +} // namespace llvm diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h b/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h new file mode 100644 index 0000000000000..66748fb9a2ce1 --- /dev/null +++ b/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h @@ -0,0 +1,44 @@ +//===-- TestBase.h ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Test fixture common to all RISC-V tests. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_TOOLS_LLVMEXEGESIS_RISCV_TESTBASE_H +#define LLVM_UNITTESTS_TOOLS_LLVMEXEGESIS_RISCV_TESTBASE_H + +#include "LlvmState.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace exegesis { + +void InitializeRISCVExegesisTarget(); + +class RISCVTestBase : public ::testing::Test { +protected: + RISCVTestBase() + : State(cantFail( + LLVMState::Create("riscv64-unknown-linux", "generic-rv64"))) {} + + static void SetUpTestCase() { + LLVMInitializeRISCVTargetInfo(); + LLVMInitializeRISCVTargetMC(); + LLVMInitializeRISCVTarget(); + InitializeRISCVExegesisTarget(); + } + + const LLVMState State; +}; + +} // namespace exegesis +} // namespace llvm + +#endif diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index 47b03b42d096d..bf6a0b7523279 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -73,6 +73,7 @@ unittest("SupportTests") { "ProcessTest.cpp", "ProgramTest.cpp", "RISCVAttributeParserTest.cpp", + "RecyclerTest.cpp", "RegexTest.cpp", "ReplaceFileTest.cpp", "ReverseIterationTest.cpp", diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 7416e522083b7..a888ac243b044 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -170,7 +170,7 @@ configure_file( # The pybind11 library can be found (set with -DPYBIND_DIR=...) # The python executable is correct (set with -DPython3_EXECUTABLE=...) # By default, find_package and probing for installed pybind11 is performed. -# Super projects can set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES=ON to +# Super projects can set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES=ON to # disable all package setup and control it themselves. 
#------------------------------------------------------------------------------- diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md index a0bd1cac118ba..32df3310d811d 100644 --- a/mlir/docs/Bindings/Python.md +++ b/mlir/docs/Bindings/Python.md @@ -1035,7 +1035,7 @@ class ConstantOp(_ods_ir.OpView): ... ``` -expects `value` to be a `TypedAttr` (e.g., `IntegerAttr` or `FloatAttr`). +expects `value` to be a `TypedAttr` (e.g., `IntegerAttr` or `FloatAttr`). Thus, a natural extension is a builder that accepts a MLIR type and a Python value and instantiates the appropriate `TypedAttr`: ```python @@ -1181,9 +1181,9 @@ make the passes available along with the dialect. Dialect functionality other than IR objects or passes, such as helper functions, can be exposed to Python similarly to attributes and types. C API is expected to exist for this functionality, which can then be wrapped using pybind11 and -`[include/mlir/Bindings/Python/PybindAdaptors.h](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/PybindAdaptors.h)`, +[`include/mlir/Bindings/Python/PybindAdaptors.h`](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/PybindAdaptors.h), or nanobind and -`[include/mlir/Bindings/Python/NanobindAdaptors.h](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h)` +[`include/mlir/Bindings/Python/NanobindAdaptors.h`](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h) utilities to connect to the rest of Python API. The bindings can be located in a separate module or in the same module as attributes and types, and loaded along with the dialect. 
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 05c000bfd8ca0..453d4f7c7e8bc 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -272,13 +272,13 @@ struct PyAttrBuilderMap { static bool dunderContains(const std::string &attributeKind) { return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value(); } - static nb::callable dundeGetItemNamed(const std::string &attributeKind) { + static nb::callable dunderGetItemNamed(const std::string &attributeKind) { auto builder = PyGlobals::get().lookupAttributeBuilder(attributeKind); if (!builder) throw nb::key_error(attributeKind.c_str()); return *builder; } - static void dundeSetItemNamed(const std::string &attributeKind, + static void dunderSetItemNamed(const std::string &attributeKind, nb::callable func, bool replace) { PyGlobals::get().registerAttributeBuilder(attributeKind, std::move(func), replace); @@ -287,8 +287,8 @@ struct PyAttrBuilderMap { static void bind(nb::module_ &m) { nb::class_(m, "AttrBuilder") .def_static("contains", &PyAttrBuilderMap::dunderContains) - .def_static("get", &PyAttrBuilderMap::dundeGetItemNamed) - .def_static("insert", &PyAttrBuilderMap::dundeSetItemNamed, + .def_static("get", &PyAttrBuilderMap::dunderGetItemNamed) + .def_static("insert", &PyAttrBuilderMap::dunderSetItemNamed, "attribute_kind"_a, "attr_builder"_a, "replace"_a = false, "Register an attribute builder for building MLIR " "attributes from python values."); diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py index 6d3a8db8c24be..0d12c35d96bee 100644 --- a/mlir/test/python/execution_engine.py +++ b/mlir/test/python/execution_engine.py @@ -306,7 +306,7 @@ def callback(a): log(arr) with Context(): - # The module takes a subview of the argument memref, casts it to an unranked memref and + # The module takes a subview of the argument memref, casts it to an unranked memref and # calls the callback with 
it. module = Module.parse( r""" diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index 66ac46437a1d4..a8b37c5ddcc2c 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -115,7 +115,7 @@ libc_support_library( hdrs = ["SortingTest.h"], deps = [ "//libc:__support_macros_config", - "//libc:qsort_util", + "//libc:qsort", "//libc/test/UnitTest:LibcUnitTest", ], )