Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add legacy IPAM metrics back to IPAMv2 #2970

Merged
merged 1 commit into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 19 additions & 40 deletions cns/ipampool/metrics.go → cns/ipampool/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
package ipampool
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Label keys and gauge values shared by the IPAM pool metrics.
// NOTE(review): this listing looks like a rendered diff in which the
// unexported names are the pre-change spellings and the exported names are
// their replacements — confirm against the real file before relying on both.
const (
// Keys of the three variable labels attached to every IPAM gauge vector.
subnetLabel = "subnet"
subnetCIDRLabel = "subnet_cidr"
podnetARMIDLabel = "podnet_arm_id"
SubnetLabel = "subnet"
SubnetCIDRLabel = "subnet_cidr"
PodnetARMIDLabel = "podnet_arm_id"
// Constant label (key and value) set through ConstLabels on the gauges.
customerMetricLabel = "customer_metric"
customerMetricLabelValue = "customer metric"
// Key of the additional label carried by the subnet exhaustion counter.
subnetExhaustionStateLabel = "subnet_exhaustion_state"
SubnetExhaustionStateLabel = "subnet_exhaustion_state"
// Values written to the subnet exhaustion state gauge.
SubnetIPExhausted = 1
SubnetIPNotExhausted = 0
)
Expand All @@ -23,110 +23,110 @@ var (
Help: "IPs currently in use by Pods on this CNS Node.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamAvailableIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_available_ips",
Help: "IPs available on this CNS Node for use by a Pod.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamBatchSize = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_batch_size",
Help: "IPAM IP pool scaling batch size.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamCurrentAvailableIPcount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_current_available_ips",
Help: "Current available IP count.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamExpectedAvailableIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_expect_available_ips",
Help: "Expected future available IP count assuming the Requested IP count is honored.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamMaxIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_max_ips",
Help: "Maximum Secondary IPs allowed on this Node.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamPendingProgramIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_pending_programming_ips",
Help: "IPs reserved but not yet available (Pending Programming).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamPendingReleaseIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_pending_release_ips",
Help: "IPs reserved but not available anymore (Pending Release).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamPrimaryIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_primary_ips",
Help: "NC Primary IP count (reserved from Pod Subnet for DNS and IMDS SNAT).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamRequestedIPConfigCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_requested_ips",
Help: "Secondary Pod Subnet IPs requested by this CNS Node (for Pods).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamSecondaryIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_secondary_ips",
Help: "Node NC Secondary IP count (reserved usable by Pods).",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamTotalIPCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_total_ips",
Help: "Count of total IP pool size allocated to CNS by DNC.",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamSubnetExhaustionState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cx_ipam_subnet_exhaustion_state",
Help: "IPAM view of subnet exhaustion state",
ConstLabels: prometheus.Labels{customerMetricLabel: customerMetricLabelValue},
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel},
)
IpamSubnetExhaustionCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "cx_ipam_subnet_exhaustion_state_count_total",
Help: "Count of the number of times the ipam pool monitor sees subnet exhaustion",
},
[]string{subnetLabel, subnetCIDRLabel, podnetARMIDLabel, subnetExhaustionStateLabel},
[]string{SubnetLabel, SubnetCIDRLabel, PodnetARMIDLabel, SubnetExhaustionStateLabel},
)
)

Expand All @@ -148,24 +148,3 @@ func init() {
IpamSubnetExhaustionCount,
)
}

// observeIPPoolState writes the given pool state and monitor metadata to the
// IPAM gauges. Every gauge is labelled with the subnet name, subnet CIDR and
// subnet ARM ID taken from meta.
func observeIPPoolState(state ipPoolState, meta metaState) {
	subnetLabels := []string{meta.subnet, meta.subnetCIDR, meta.subnetARMID}
	primaryCount := len(meta.primaryIPAddresses)

	// Resolve the exhaustion gauge value up front so the gauge is set in one place.
	exhaustion := float64(SubnetIPNotExhausted)
	if meta.exhausted {
		exhaustion = float64(SubnetIPExhausted)
	}

	IpamAllocatedIPCount.WithLabelValues(subnetLabels...).Set(float64(state.allocatedToPods))
	IpamAvailableIPCount.WithLabelValues(subnetLabels...).Set(float64(state.available))
	IpamBatchSize.WithLabelValues(subnetLabels...).Set(float64(meta.batch))
	IpamCurrentAvailableIPcount.WithLabelValues(subnetLabels...).Set(float64(state.currentAvailableIPs))
	IpamExpectedAvailableIPCount.WithLabelValues(subnetLabels...).Set(float64(state.expectedAvailableIPs))
	IpamMaxIPCount.WithLabelValues(subnetLabels...).Set(float64(meta.max))
	IpamPendingProgramIPCount.WithLabelValues(subnetLabels...).Set(float64(state.pendingProgramming))
	IpamPendingReleaseIPCount.WithLabelValues(subnetLabels...).Set(float64(state.pendingRelease))
	IpamPrimaryIPCount.WithLabelValues(subnetLabels...).Set(float64(primaryCount))
	IpamRequestedIPConfigCount.WithLabelValues(subnetLabels...).Set(float64(state.requestedIPs))
	IpamSecondaryIPCount.WithLabelValues(subnetLabels...).Set(float64(state.secondaryIPs))
	// Total is the secondary IPs plus the distinct primary IPs.
	IpamTotalIPCount.WithLabelValues(subnetLabels...).Set(float64(state.secondaryIPs + int64(primaryCount)))
	IpamSubnetExhaustionState.WithLabelValues(subnetLabels...).Set(exhaustion)
}
157 changes: 157 additions & 0 deletions cns/ipampool/metrics/observer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
package metrics

import (
"context"
"fmt"
"net/netip"

"github.com/Azure/azure-container-networking/cns"
"github.com/Azure/azure-container-networking/cns/types"
"github.com/Azure/azure-container-networking/crd/clustersubnetstate/api/v1alpha1"
"github.com/Azure/azure-container-networking/crd/nodenetworkconfig/api/v1alpha"
"github.com/pkg/errors"
)

// subnetARMIDTemplate is the format string for a Subnet ARM ID:
// /subscriptions/$(SUB)/resourceGroups/$(GROUP)/providers/Microsoft.Network/virtualNetworks/$(VNET)/subnets/$(SUBNET)
// The four %s verbs are filled, in order, with the subscription, resource
// group, VNET and subnet identifiers (see generateARMID).
const subnetARMIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s"

// ipPoolState is the current actual state of the CNS IP pool.
// observeMetrics fills it from one snapshot of the IP configuration map and
// the NodeNetworkConfig, then exports each field as a gauge.
type ipPoolState struct {
// allocatedToPods are the IPs CNS gives to Pods (IPs in state "Assigned").
allocatedToPods int64
// available are the IPs in state "Available".
available int64
// currentAvailableIPs are the current available IPs:
// secondaryIPs - allocatedToPods - pendingRelease.
currentAvailableIPs int64
// expectedAvailableIPs are the "future" available IPs, if the requested IP
// count is honored: requestedIPs - allocatedToPods.
expectedAvailableIPs int64
// pendingProgramming are the IPs in state "PendingProgramming".
pendingProgramming int64
// pendingRelease are the IPs in state "PendingRelease".
pendingRelease int64
// requestedIPs are the IPs CNS has requested that it be allocated by DNC
// (read from the NodeNetworkConfig spec's RequestedIPCount).
requestedIPs int64
// secondaryIPs are all the IPs given to CNS by DNC, not including the primary IP of the NC
// (counted as the number of entries in the IP configuration map).
secondaryIPs int64
}

// metaState is the Monitor's configuration state for the IP pool.
// Its string fields become the label values of the IPAM gauges; the remaining
// fields are exported as gauge values.
type metaState struct {
// batch is the value exported by the IPAM batch size gauge.
batch int64
// exhausted is true when a ClusterSubnetState reports the subnet as exhausted.
exhausted bool
// max is the value exported by the IPAM max IP count gauge.
max int64
// primaryIPAddresses is the set of distinct NC primary IP addresses; its
// size is exported as the primary IP count.
primaryIPAddresses map[string]struct{}
// subnet is the subnet name label value.
subnet string
// subnetARMID is the pod subnet ARM ID label value.
subnetARMID string
// subnetCIDR is the subnet address space label value.
subnetCIDR string
}

// NewLegacyMetricsObserver binds ctx and the three data sources into a
// zero-argument function. Each call of the returned function runs one
// observation of the legacy IPAM pool metrics with those bound arguments and
// returns the error from that observation, if any.
//
//nolint:lll // ignore line length
func NewLegacyMetricsObserver(ctx context.Context, ipcli func() map[string]cns.IPConfigurationStatus, nnccli func(context.Context) (*v1alpha.NodeNetworkConfig, error), csscli func(context.Context) ([]v1alpha1.ClusterSubnetState, error)) func() error {
	observe := func() error {
		return observeMetrics(ctx, ipcli, nnccli, csscli)
	}
	return observe
}

// generateARMID renders the Subnet ARM ID of nc from its subscription,
// resource group, VNET and subnet identifiers.
// If any of those four identifiers is empty, the result is the empty string.
func generateARMID(nc *v1alpha.NetworkContainer) string {
	// All four components are required; bail out on the first missing one.
	for _, component := range []string{nc.SubscriptionID, nc.ResourceGroupID, nc.VNETID, nc.SubnetID} {
		if component == "" {
			return ""
		}
	}
	return fmt.Sprintf(subnetARMIDTemplate, nc.SubscriptionID, nc.ResourceGroupID, nc.VNETID, nc.SubnetID)
}

// observeMetrics observes the IP pool and updates the metrics. Blocking.
//
//nolint:lll // ignore line length
func observeMetrics(ctx context.Context, ipcli func() map[string]cns.IPConfigurationStatus, nnccli func(context.Context) (*v1alpha.NodeNetworkConfig, error), csscli func(context.Context) ([]v1alpha1.ClusterSubnetState, error)) error {
nddq marked this conversation as resolved.
Show resolved Hide resolved
csslist, err := csscli(ctx)
if err != nil {
return err
}
nnc, err := nnccli(ctx)
if err != nil {
return err
}
ips := ipcli()

var meta metaState
for i := range csslist {
if csslist[i].Status.Exhausted {
meta.exhausted = true
break
}
}
if len(nnc.Status.NetworkContainers) > 0 {
// Set SubnetName, SubnetAddressSpace and Pod Network ARM ID values to the global subnet, subnetCIDR and subnetARM variables.
meta.subnet = nnc.Status.NetworkContainers[0].SubnetName
meta.subnetCIDR = nnc.Status.NetworkContainers[0].SubnetAddressSpace
meta.subnetARMID = generateARMID(&nnc.Status.NetworkContainers[0])
}
meta.primaryIPAddresses = make(map[string]struct{})
// Add Primary IP to Map, if not present.
// This is only for Swift i.e. if NC Type is vnet.
for i := 0; i < len(nnc.Status.NetworkContainers); i++ {
nc := nnc.Status.NetworkContainers[i]
if nc.Type == "" || nc.Type == v1alpha.VNET {
meta.primaryIPAddresses[nc.PrimaryIP] = struct{}{}
}

if nc.Type == v1alpha.VNETBlock {
primaryPrefix, err := netip.ParsePrefix(nc.PrimaryIP)
if err != nil {
return errors.Wrapf(err, "unable to parse ip prefix: %s", nc.PrimaryIP)
}
meta.primaryIPAddresses[primaryPrefix.Addr().String()] = struct{}{}
}
}

state := ipPoolState{
secondaryIPs: int64(len(ips)),
requestedIPs: nnc.Spec.RequestedIPCount,
}
for i := range ips {
ip := ips[i]
switch ip.GetState() {
case types.Assigned:
state.allocatedToPods++
case types.Available:
state.available++
case types.PendingProgramming:
state.pendingProgramming++
case types.PendingRelease:
state.pendingRelease++
}
}
state.currentAvailableIPs = state.secondaryIPs - state.allocatedToPods - state.pendingRelease
state.expectedAvailableIPs = state.requestedIPs - state.allocatedToPods

labels := []string{meta.subnet, meta.subnetCIDR, meta.subnetARMID}
IpamAllocatedIPCount.WithLabelValues(labels...).Set(float64(state.allocatedToPods))
IpamAvailableIPCount.WithLabelValues(labels...).Set(float64(state.available))
IpamBatchSize.WithLabelValues(labels...).Set(float64(meta.batch))
IpamCurrentAvailableIPcount.WithLabelValues(labels...).Set(float64(state.currentAvailableIPs))
IpamExpectedAvailableIPCount.WithLabelValues(labels...).Set(float64(state.expectedAvailableIPs))
IpamMaxIPCount.WithLabelValues(labels...).Set(float64(meta.max))
IpamPendingProgramIPCount.WithLabelValues(labels...).Set(float64(state.pendingProgramming))
IpamPendingReleaseIPCount.WithLabelValues(labels...).Set(float64(state.pendingRelease))
IpamPrimaryIPCount.WithLabelValues(labels...).Set(float64(len(meta.primaryIPAddresses)))
IpamRequestedIPConfigCount.WithLabelValues(labels...).Set(float64(state.requestedIPs))
IpamSecondaryIPCount.WithLabelValues(labels...).Set(float64(state.secondaryIPs))
IpamTotalIPCount.WithLabelValues(labels...).Set(float64(state.secondaryIPs + int64(len(meta.primaryIPAddresses))))
if meta.exhausted {
IpamSubnetExhaustionState.WithLabelValues(labels...).Set(float64(SubnetIPExhausted))
} else {
IpamSubnetExhaustionState.WithLabelValues(labels...).Set(float64(SubnetIPNotExhausted))
}
return nil
}
28 changes: 25 additions & 3 deletions cns/ipampool/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/Azure/azure-container-networking/cns"
"github.com/Azure/azure-container-networking/cns/ipampool/metrics"
"github.com/Azure/azure-container-networking/cns/logger"
"github.com/Azure/azure-container-networking/cns/metric"
"github.com/Azure/azure-container-networking/cns/types"
Expand Down Expand Up @@ -105,9 +106,9 @@ func (pm *Monitor) Start(ctx context.Context) error {
case css := <-pm.cssSource: // received an updated ClusterSubnetState
pm.metastate.exhausted = css.Status.Exhausted
logger.Printf("subnet exhausted status = %t", pm.metastate.exhausted)
IpamSubnetExhaustionCount.With(prometheus.Labels{
subnetLabel: pm.metastate.subnet, subnetCIDRLabel: pm.metastate.subnetCIDR,
podnetARMIDLabel: pm.metastate.subnetARMID, subnetExhaustionStateLabel: strconv.FormatBool(pm.metastate.exhausted),
metrics.IpamSubnetExhaustionCount.With(prometheus.Labels{
metrics.SubnetLabel: pm.metastate.subnet, metrics.SubnetCIDRLabel: pm.metastate.subnetCIDR,
metrics.PodnetARMIDLabel: pm.metastate.subnetARMID, metrics.SubnetExhaustionStateLabel: strconv.FormatBool(pm.metastate.exhausted),
}).Inc()
select {
default:
Expand Down Expand Up @@ -482,6 +483,27 @@ func (pm *Monitor) clampScaler(scaler *v1alpha.Scaler) {
}
}

// observeIPPoolState writes the given pool state and monitor metadata to the
// IPAM gauges of the metrics package. Every gauge is labelled with the subnet
// name, subnet CIDR and subnet ARM ID taken from meta.
func observeIPPoolState(state ipPoolState, meta metaState) {
	subnetLabels := []string{meta.subnet, meta.subnetCIDR, meta.subnetARMID}
	primaryCount := len(meta.primaryIPAddresses)

	// Resolve the exhaustion gauge value up front so the gauge is set in one place.
	exhaustion := float64(metrics.SubnetIPNotExhausted)
	if meta.exhausted {
		exhaustion = float64(metrics.SubnetIPExhausted)
	}

	metrics.IpamAllocatedIPCount.WithLabelValues(subnetLabels...).Set(float64(state.allocatedToPods))
	metrics.IpamAvailableIPCount.WithLabelValues(subnetLabels...).Set(float64(state.available))
	metrics.IpamBatchSize.WithLabelValues(subnetLabels...).Set(float64(meta.batch))
	metrics.IpamCurrentAvailableIPcount.WithLabelValues(subnetLabels...).Set(float64(state.currentAvailableIPs))
	metrics.IpamExpectedAvailableIPCount.WithLabelValues(subnetLabels...).Set(float64(state.expectedAvailableIPs))
	metrics.IpamMaxIPCount.WithLabelValues(subnetLabels...).Set(float64(meta.max))
	metrics.IpamPendingProgramIPCount.WithLabelValues(subnetLabels...).Set(float64(state.pendingProgramming))
	metrics.IpamPendingReleaseIPCount.WithLabelValues(subnetLabels...).Set(float64(state.pendingRelease))
	metrics.IpamPrimaryIPCount.WithLabelValues(subnetLabels...).Set(float64(primaryCount))
	metrics.IpamRequestedIPConfigCount.WithLabelValues(subnetLabels...).Set(float64(state.requestedIPs))
	metrics.IpamSecondaryIPCount.WithLabelValues(subnetLabels...).Set(float64(state.secondaryIPs))
	// Total is the secondary IPs plus the distinct primary IPs.
	metrics.IpamTotalIPCount.WithLabelValues(subnetLabels...).Set(float64(state.secondaryIPs + int64(primaryCount)))
	metrics.IpamSubnetExhaustionState.WithLabelValues(subnetLabels...).Set(exhaustion)
}

// CalculateMinFreeIPs calculates the minimum free IP quantity based on the Scaler
// in the passed NodeNetworkConfig.
// Half of odd batches are rounded up!
Expand Down
Loading
Loading