Skip to content

Commit

Permalink
[CostModel] Handle vector struct results and cost llvm.sincos (llvm…
Browse files Browse the repository at this point in the history
…#123210)

This patch updates the cost model to cost intrinsics that return
multiple values (in structs) correctly. Previously, the cost model only
thought intrinsics that return `VectorType` need scalarizing, which
meant it cost intrinsics that return multiple vectors (that need
scalarizing) way too cheap (giving it the cost of a single function
call).

This patch also adds a custom cost for llvm.sincos when a vector
function library is available, as certain VFs can be expanded (later in
code gen) to a vector function, reducing the cost to a single call (+
the possible loads from the vector function returns values via output
pointers).
  • Loading branch information
MacDue authored Feb 26, 2025
1 parent 178b9e5 commit 900220d
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 62 deletions.
7 changes: 5 additions & 2 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,13 @@ class IntrinsicCostAttributes {
// If ScalarizationCost is UINT_MAX, the cost of scalarizing the
// arguments and the return value will be computed based on types.
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
TargetLibraryInfo const *LibInfo = nullptr;

public:
IntrinsicCostAttributes(
Intrinsic::ID Id, const CallBase &CI,
InstructionCost ScalarCost = InstructionCost::getInvalid(),
bool TypeBasedOnly = false);
bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr);

IntrinsicCostAttributes(
Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
Expand All @@ -145,7 +146,8 @@ class IntrinsicCostAttributes {
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
const IntrinsicInst *I = nullptr,
InstructionCost ScalarCost = InstructionCost::getInvalid());
InstructionCost ScalarCost = InstructionCost::getInvalid(),
TargetLibraryInfo const *LibInfo = nullptr);

Intrinsic::ID getID() const { return IID; }
const IntrinsicInst *getInst() const { return II; }
Expand All @@ -154,6 +156,7 @@ class IntrinsicCostAttributes {
InstructionCost getScalarizationCost() const { return ScalarizationCost; }
const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; }
const TargetLibraryInfo *getLibInfo() const { return LibInfo; }

bool isTypeBasedOnly() const {
return Arguments.empty();
Expand Down
113 changes: 95 additions & 18 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Analysis/ValueTracking.h"
Expand Down Expand Up @@ -285,6 +286,64 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return false;
}

/// Several intrinsics that return structs (including llvm.sincos[pi] and
/// llvm.modf) can be lowered to a vector library call (for certain VFs). The
/// vector library functions correspond to the scalar calls (e.g. sincos or
/// modf), which unlike the intrinsic return values via output pointers. This
/// helper checks if a vector call exists for the given intrinsic, and returns
/// the cost, which includes the cost of the mask (if required), and the loads
/// for values returned via output pointers. \p LC is the scalar libcall and
/// \p CallRetElementIndex (optional) is the struct element which is mapped to
/// the call return value. If std::nullopt is returned, then no vector library
/// call is available, so the intrinsic should be assigned the default cost
/// (e.g. scalarization).
std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost(
const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) {
Type *RetTy = ICA.getReturnType();
// Vector variants of the intrinsic can be mapped to a vector library call.
auto const *LibInfo = ICA.getLibInfo();
if (!LibInfo || !isa<StructType>(RetTy) ||
!isVectorizedStructTy(cast<StructType>(RetTy)))
return std::nullopt;

// Find associated libcall.
const char *LCName = getTLI()->getLibcallName(LC);
if (!LCName)
return std::nullopt;

// Search for a corresponding vector variant.
LLVMContext &Ctx = RetTy->getContext();
ElementCount VF = getVectorizedTypeVF(RetTy);
VecDesc const *VD = nullptr;
for (bool Masked : {false, true}) {
if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
break;
}
if (!VD)
return std::nullopt;

// Cost the call + mask.
auto Cost =
thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
if (VD->isMasked())
Cost += thisT()->getShuffleCost(
TargetTransformInfo::SK_Broadcast,
VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
nullptr, {});

// Lowering to a library call (with output pointers) may require us to emit
// reloads for the results.
for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) {
if (Idx == CallRetElementIndex)
continue;
Cost += thisT()->getMemoryOpCost(
Instruction::Load, VectorTy,
thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind);
}
return Cost;
}

protected:
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
: BaseT(DL) {}
Expand Down Expand Up @@ -1726,9 +1785,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

Type *RetTy = ICA.getReturnType();

ElementCount RetVF =
(RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
: ElementCount::getFixed(1));
ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
: ElementCount::getFixed(1);

const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
Expand Down Expand Up @@ -1997,6 +2056,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
case Intrinsic::experimental_vector_match:
return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
case Intrinsic::sincos: {
Type *Ty = getContainedTypes(RetTy).front();
EVT VT = getTLI()->getValueType(DL, Ty);
RTLIB::Libcall LC = RTLIB::getSINCOS(VT.getScalarType());
if (auto Cost =
getMultipleResultIntrinsicVectorLibCallCost(ICA, CostKind, LC))
return *Cost;
// Otherwise, fallback to default scalarization cost.
break;
}
}

// Assume that we need to scalarize this intrinsic.)
Expand All @@ -2005,10 +2074,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
if (RetVF.isVector() && !RetVF.isScalable()) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost += getScalarizationOverhead(
cast<VectorType>(RetTy),
/*Insert*/ true, /*Extract*/ false, CostKind);
if (!RetTy->isVoidTy()) {
for (Type *VectorTy : getContainedTypes(RetTy)) {
ScalarizationCost += getScalarizationOverhead(
cast<VectorType>(VectorTy),
/*Insert=*/true, /*Extract=*/false, CostKind);
}
}
ScalarizationCost +=
getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
}
Expand Down Expand Up @@ -2689,27 +2761,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Else, assume that we need to scalarize this intrinsic. For math builtins
// this will emit a costly libcall, adding call overhead and spills. Make it
// very expensive.
if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
if (isVectorizedTy(RetTy)) {
ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);

// Scalable vectors cannot be scalarized, so return Invalid.
if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
return isa<ScalableVectorType>(Ty);
}))
if (any_of(concat<Type *const>(RetVTys, Tys),
[](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
return InstructionCost::getInvalid();

InstructionCost ScalarizationCost =
SkipScalarizationCost
? ScalarizationCostPassed
: getScalarizationOverhead(RetVTy, /*Insert*/ true,
/*Extract*/ false, CostKind);
InstructionCost ScalarizationCost = ScalarizationCostPassed;
if (!SkipScalarizationCost) {
ScalarizationCost = 0;
for (Type *RetVTy : RetVTys) {
ScalarizationCost += getScalarizationOverhead(
cast<VectorType>(RetVTy), /*Insert=*/true,
/*Extract=*/false, CostKind);
}
}

unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
SmallVector<Type *, 4> ScalarTys;
for (Type *Ty : Tys) {
if (Ty->isVectorTy())
Ty = Ty->getScalarType();
ScalarTys.push_back(Ty);
}
IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
InstructionCost ScalarCost =
thisT()->getIntrinsicInstrCost(Attrs, CostKind);
for (Type *Ty : Tys) {
Expand Down
18 changes: 13 additions & 5 deletions llvm/lib/Analysis/CostModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/CostModel.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static cl::opt<TargetTransformInfo::TargetCostKind> CostKind(
Expand All @@ -42,25 +44,31 @@ static cl::opt<bool> TypeBasedIntrinsicCost("type-based-intrinsic-cost",
cl::desc("Calculate intrinsics cost based only on argument types"),
cl::init(false));

static cl::opt<bool> PreferIntrinsicCost(
"prefer-intrinsic-cost",
cl::desc("Prefer using getIntrinsicInstrCost over getInstructionCost"),
cl::init(false));

#define CM_NAME "cost-model"
#define DEBUG_TYPE CM_NAME

PreservedAnalyses CostModelPrinterPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n";
for (BasicBlock &B : F) {
for (Instruction &Inst : B) {
// TODO: Use a pass parameter instead of cl::opt CostKind to determine
// which cost kind to print.
InstructionCost Cost;
auto *II = dyn_cast<IntrinsicInst>(&Inst);
if (II && TypeBasedIntrinsicCost) {
IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
InstructionCost::getInvalid(), true);
if (II && (PreferIntrinsicCost || TypeBasedIntrinsicCost)) {
IntrinsicCostAttributes ICA(
II->getIntrinsicID(), *II, InstructionCost::getInvalid(),
/*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
}
else {
} else {
Cost = TTI.getInstructionCost(&Inst, CostKind);
}

Expand Down
17 changes: 8 additions & 9 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {

IntrinsicCostAttributes::IntrinsicCostAttributes(
Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost,
bool TypeBasedOnly)
bool TypeBasedOnly, const TargetLibraryInfo *LibInfo)
: II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
ScalarizationCost(ScalarizationCost) {
ScalarizationCost(ScalarizationCost), LibInfo(LibInfo) {

if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Expand Down Expand Up @@ -101,13 +101,12 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
ParamTys.push_back(Argument->getType());
}

IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
const IntrinsicInst *I,
InstructionCost ScalarCost)
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
IntrinsicCostAttributes::IntrinsicCostAttributes(
Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys, FastMathFlags Flags, const IntrinsicInst *I,
InstructionCost ScalarCost, TargetLibraryInfo const *LibInfo)
: II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost),
LibInfo(LibInfo) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
}
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/sincos.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -prefer-intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB

define void @sincos() {
; CHECK-LABEL: 'sincos'
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
;
; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
;
; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
; CHECK: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
;
; CHECK-VECLIB-LABEL: 'sincos'
; CHECK-VECLIB: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
;
; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
;
; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
;
%f16 = call { half, half } @llvm.sincos.f16(half poison)
%f32 = call { float, float } @llvm.sincos.f32(float poison)
%f64 = call { double, double } @llvm.sincos.f64(double poison)
%f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)

%v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
%v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
%v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
%v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
%v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)

%nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.v8f16(<vscale x 8 x half> poison)
%nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.v4f32(<vscale x 4 x float> poison)
%nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.v2f64(<vscale x 2 x double> poison)
%nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.v1f128(<vscale x 1 x fp128> poison)
%nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.v8f32(<vscale x 8 x float> poison)

ret void
}
Loading

0 comments on commit 900220d

Please sign in to comment.