Documentation ¶
Index ¶
- Constants
- Variables
- func DecFlowControlQueueSize(fairnessID, priority string)
- func DecRunningRequests(modelName string)
- func IncFlowControlQueueSize(fairnessID, priority string)
- func IncRunningRequests(modelName string)
- func RecordFlowControlRequestQueueDuration(fairnessID, priority, outcome string, duration time.Duration)
- func RecordInferenceExtensionInfo(commitSha, buildRef string)
- func RecordInferencePoolAvgKVCache(name string, utilization float64)
- func RecordInferencePoolAvgQueueSize(name string, queueSize float64)
- func RecordInferencePoolReadyPods(name string, runningPods float64)
- func RecordInputTokens(modelName, targetModelName string, size int)
- func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, ...) bool
- func RecordOutputTokens(modelName, targetModelName string, size int)
- func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration)
- func RecordPrefixCacheMatch(matchedLength, totalLength int)
- func RecordPrefixCacheSize(size int64)
- func RecordRequestCounter(modelName, targetModelName string)
- func RecordRequestErrCounter(modelName, targetModelName string, code string)
- func RecordRequestLatencies(ctx context.Context, modelName, targetModelName string, received time.Time, ...) bool
- func RecordRequestSizes(modelName, targetModelName string, reqSize int)
- func RecordResponseSizes(modelName, targetModelName string, size int)
- func RecordSchedulerE2ELatency(duration time.Duration)
- func Register(customCollectors ...prometheus.Collector)
- func Reset()
Constants ¶
const (
	InferenceObjectiveComponent = "inference_objective"
	InferencePoolComponent      = "inference_pool"
	InferenceExtension          = "inference_extension"
)
Variables ¶
var (
	// NTPOT - Normalized Time Per Output Token
	NormalizedTimePerOutputToken = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceObjectiveComponent,
			Name:      "normalized_time_per_output_token_seconds",
			Help:      metricsutil.HelpMsgWithStability("Inference objective latency divided by number of output tokens in seconds for each model and target model.", compbasemetrics.ALPHA),
			Buckets: []float64{
				0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
			},
		},
		[]string{"model_name", "target_model_name"},
	)

	// Scheduler Metrics
	SchedulerE2ELatency = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "scheduler_e2e_duration_seconds",
			Help:      metricsutil.HelpMsgWithStability("End-to-end scheduling latency distribution in seconds.", compbasemetrics.ALPHA),
			Buckets: []float64{
				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
			},
		},
		[]string{},
	)

	PluginProcessingLatencies = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "plugin_duration_seconds",
			Help:      metricsutil.HelpMsgWithStability("Plugin processing latency distribution in seconds for each extension point, plugin type and plugin name.", compbasemetrics.ALPHA),
			Buckets: []float64{
				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
			},
		},
		[]string{"extension_point", "plugin_type", "plugin_name"},
	)

	// Prefix indexer Metrics
	PrefixCacheSize = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_size",
			Help:      metricsutil.HelpMsgWithStability("Size of the prefix indexer.", compbasemetrics.ALPHA),
		},
		[]string{},
	)

	PrefixCacheHitRatio = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_hit_ratio",
			Help:      metricsutil.HelpMsgWithStability("Ratio of prefix length matched to total prefix length in the cache lookup.", compbasemetrics.ALPHA),
			Buckets:   []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
		},
		[]string{},
	)

	PrefixCacheHitLength = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_hit_bytes",
			Help:      metricsutil.HelpMsgWithStability("Length of the prefix match in number of bytes in the cache lookup.", compbasemetrics.ALPHA),
			Buckets:   []float64{0, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536},
		},
		[]string{},
	)

	// Info Metrics
	InferenceExtensionInfo = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: InferenceExtension,
			Name:      "info",
			Help:      metricsutil.HelpMsgWithStability("General information of the current build of Inference Extension.", compbasemetrics.ALPHA),
		},
		[]string{"commit", "build_ref"},
	)
)
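The collectors above only export data once registered via Register (see the index). A minimal wiring sketch, assuming the import path shown in the comment; the custom gauge, commit SHA, and build ref are placeholders:

// A wiring sketch, not part of this package. Assumptions: the import path below,
// and that Register is called once at startup before any metric is recorded.
package main

import (
	"github.com/prometheus/client_golang/prometheus"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

// exampleGauge is an illustrative caller-owned collector passed through Register's
// variadic customCollectors parameter.
var exampleGauge = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: "example_custom_gauge",
	Help: "Illustrative collector registered alongside the package metrics.",
})

func main() {
	// Register the package's collectors plus any caller-defined ones.
	metrics.Register(exampleGauge)

	// Placeholder commit SHA and build ref for the info gauge defined above.
	metrics.RecordInferenceExtensionInfo("0000000", "v0.0.0-example")
}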
Functions ¶
func DecFlowControlQueueSize ¶ added in v1.1.0
func DecFlowControlQueueSize(fairnessID, priority string)
DecFlowControlQueueSize decrements the Flow Control queue size gauge.
func DecRunningRequests ¶ added in v0.3.0
func DecRunningRequests(modelName string)
DecRunningRequests decreases the count of currently running requests for the given model.
func IncFlowControlQueueSize ¶ added in v1.1.0
func IncFlowControlQueueSize(fairnessID, priority string)
IncFlowControlQueueSize increments the Flow Control queue size gauge.
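IncFlowControlQueueSize and DecFlowControlQueueSize are intended to be kept in balance so the gauge tracks live queue depth. A minimal sketch of pairing them around a queued item; the flowQueue and item types and the import path are assumptions, not part of this package:

// A sketch, not part of this package: keep the gauge balanced by incrementing on
// push and decrementing on pop.
package flow

import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path

type item struct{ fairnessID, priority string }

type flowQueue struct{ items []item }

func (q *flowQueue) push(it item) {
	q.items = append(q.items, it)
	metrics.IncFlowControlQueueSize(it.fairnessID, it.priority)
}

func (q *flowQueue) pop() (item, bool) {
	if len(q.items) == 0 {
		return item{}, false
	}
	it := q.items[0]
	q.items = q.items[1:]
	metrics.DecFlowControlQueueSize(it.fairnessID, it.priority)
	return it, true
}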
func IncRunningRequests ¶ added in v0.3.0
func IncRunningRequests(modelName string)
IncRunningRequests increases the count of currently running requests for the given model.
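A minimal sketch of bracketing request handling with this pair, deferring the decrement so the gauge drops on every exit path; handleRequest and the import path are assumptions:

// A sketch, not part of this package.
package serving

import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path

func handleRequest(modelName string, do func() error) error {
	metrics.IncRunningRequests(modelName)
	// Deferred so the gauge is decremented even when do() returns an error.
	defer metrics.DecRunningRequests(modelName)
	return do()
}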
func RecordFlowControlRequestQueueDuration ¶ added in v1.1.0
func RecordFlowControlRequestQueueDuration(fairnessID, priority, outcome string, duration time.Duration)
RecordFlowControlRequestQueueDuration records the duration a request spent in the Flow Control layer.
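A sketch of measuring the queue wait with time.Since and labelling it with an outcome; the outcome strings "dispatched" and "rejected", the 30-second timeout, and the import path are illustrative assumptions:

// A sketch, not part of this package.
package flow

import (
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

func waitForDispatch(fairnessID, priority string, dispatched <-chan struct{}) {
	enqueuedAt := time.Now()
	select {
	case <-dispatched:
		// Request left the queue normally.
		metrics.RecordFlowControlRequestQueueDuration(fairnessID, priority, "dispatched", time.Since(enqueuedAt))
	case <-time.After(30 * time.Second):
		// Request timed out while queued.
		metrics.RecordFlowControlRequestQueueDuration(fairnessID, priority, "rejected", time.Since(enqueuedAt))
	}
}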
func RecordInferenceExtensionInfo ¶ added in v0.4.0
func RecordInferenceExtensionInfo(commitSha, buildRef string)
func RecordInferencePoolAvgKVCache ¶
func RecordInferencePoolAvgKVCache(name string, utilization float64)
func RecordInferencePoolAvgQueueSize ¶
func RecordInferencePoolAvgQueueSize(name string, queueSize float64)
func RecordInferencePoolReadyPods ¶ added in v0.5.0
func RecordInferencePoolReadyPods(name string, runningPods float64)
func RecordInputTokens ¶
func RecordInputTokens(modelName, targetModelName string, size int)
RecordInputTokens records the input token count.
func RecordNormalizedTimePerOutputToken ¶ added in v0.4.0
func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool
RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
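A sketch of calling it when a response completes, using the request's received timestamp and the final output token count; the reading of the bool return as "observation recorded", and the import path, are assumptions:

// A sketch, not part of this package.
package handlers

import (
	"context"
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

func onResponseComplete(ctx context.Context, modelName, targetModelName string, received time.Time, outputTokens int) bool {
	// The bool return is assumed to report whether the observation was recorded.
	return metrics.RecordNormalizedTimePerOutputToken(ctx, modelName, targetModelName, received, time.Now(), outputTokens)
}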
func RecordOutputTokens ¶
func RecordOutputTokens(modelName, targetModelName string, size int)
RecordOutputTokens records the output token count.
func RecordPluginProcessingLatency ¶ added in v1.0.0
func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration)
RecordPluginProcessingLatency records the processing latency for a plugin.
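A sketch of timing a single plugin invocation with time.Since; the wrapper function and the import path are assumptions:

// A sketch, not part of this package.
package scheduling

import (
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

func timedPluginCall(extensionPoint, pluginType, pluginName string, call func()) {
	start := time.Now()
	call()
	metrics.RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName, time.Since(start))
}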
func RecordPrefixCacheMatch ¶ added in v0.4.0
func RecordPrefixCacheMatch(matchedLength, totalLength int)
RecordPrefixCacheMatch records both the hit ratio and hit length for a prefix indexer match. matchedLength is the number of characters that matched, and totalLength is the total prefix length.
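A sketch of feeding a cache lookup result into this helper; longestMatch is a placeholder for the indexer's longest-prefix lookup, and the import path is an assumption:

// A sketch, not part of this package.
package prefix

import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path

func recordLookup(prompt string, longestMatch func(string) int) {
	matched := longestMatch(prompt)
	// Feeds both the hit-ratio (matched/total) and hit-length histograms defined above.
	metrics.RecordPrefixCacheMatch(matched, len(prompt))
}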
func RecordPrefixCacheSize ¶ added in v0.4.0
func RecordPrefixCacheSize(size int64)
RecordPrefixCacheSize records the size of the prefix indexer in megabytes.
func RecordRequestCounter ¶
func RecordRequestCounter(modelName, targetModelName string)
RecordRequestCounter records the number of requests.
func RecordRequestErrCounter ¶
func RecordRequestErrCounter(modelName, targetModelName string, code string)
RecordRequestErrCounter records the number of requests that resulted in an error, labelled by error code.
func RecordRequestLatencies ¶
func RecordRequestLatencies(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time) bool
RecordRequestLatencies records the duration of a request, from the received timestamp to the completion timestamp.
func RecordRequestSizes ¶
func RecordRequestSizes(modelName, targetModelName string, reqSize int)
RecordRequestSizes records the request sizes.
func RecordResponseSizes ¶
func RecordResponseSizes(modelName, targetModelName string, size int)
RecordResponseSizes records the response sizes.
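Taken together, the request-level helpers above support per-request bookkeeping along the lines of the following sketch; the result type, the "InternalError" code, and the import path are assumptions:

// A sketch, not part of this package.
package handlers

import (
	"context"
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

type result struct {
	err          error
	responseSize int
}

func observeRequest(ctx context.Context, modelName, targetModelName string, reqSize int, do func() result) {
	metrics.RecordRequestCounter(modelName, targetModelName)
	metrics.RecordRequestSizes(modelName, targetModelName, reqSize)

	received := time.Now()
	res := do()

	if res.err != nil {
		metrics.RecordRequestErrCounter(modelName, targetModelName, "InternalError") // placeholder code
		return
	}
	metrics.RecordRequestLatencies(ctx, modelName, targetModelName, received, time.Now())
	metrics.RecordResponseSizes(modelName, targetModelName, res.responseSize)
}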
func RecordSchedulerE2ELatency ¶ added in v0.4.0
func RecordSchedulerE2ELatency(duration time.Duration)
RecordSchedulerE2ELatency records the end-to-end scheduling latency.
func Register ¶
func Register(customCollectors ...prometheus.Collector)
func Reset ¶
func Reset()
Types ¶
This section is empty.