metrics

package
v1.1.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 24, 2025 License: Apache-2.0 Imports: 9 Imported by: 3

Documentation

Index

Constants

View Source
// Component names. The metric definitions in this package pass these values as
// the Prometheus Subsystem of each collector.
const (
	// InferenceObjectiveComponent is the subsystem name for inference objective metrics.
	InferenceObjectiveComponent = "inference_objective"
	// InferencePoolComponent is presumably the subsystem name for inference pool
	// metrics. NOTE(review): confirm against the collectors that use it.
	InferencePoolComponent = "inference_pool"
	// InferenceExtension is the subsystem name for inference extension metrics.
	InferenceExtension = "inference_extension"
)

Variables

View Source
// Exported Prometheus collectors. Each one uses InferenceObjectiveComponent or
// InferenceExtension as its Subsystem, and builds its Help text by passing a
// description and compbasemetrics.ALPHA to metricsutil.HelpMsgWithStability.
var (

	// NTPOT - Normalized Time Per Output Token
	//
	// NormalizedTimePerOutputToken is a histogram, labelled by model_name and
	// target_model_name, of inference objective latency divided by the number of
	// output tokens, in seconds.
	NormalizedTimePerOutputToken = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceObjectiveComponent,
			Name:      "normalized_time_per_output_token_seconds",
			Help:      metricsutil.HelpMsgWithStability("Inference objective latency divided by number of output tokens in seconds for each model and target model.", compbasemetrics.ALPHA),

			// Bucket upper bounds in seconds, from 1ms up to 10s.
			Buckets: []float64{
				0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
			},
		},
		[]string{"model_name", "target_model_name"},
	)

	// Scheduler Metrics
	//
	// SchedulerE2ELatency is a label-less histogram of end-to-end scheduling
	// latency in seconds.
	SchedulerE2ELatency = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "scheduler_e2e_duration_seconds",
			Help:      metricsutil.HelpMsgWithStability("End-to-end scheduling latency distribution in seconds.", compbasemetrics.ALPHA),
			// Bucket upper bounds in seconds, from 0.1ms up to 100ms.
			Buckets: []float64{
				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
			},
		},
		[]string{},
	)

	// PluginProcessingLatencies is a histogram of plugin processing latency in
	// seconds, labelled by extension_point, plugin_type and plugin_name.
	PluginProcessingLatencies = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "plugin_duration_seconds",
			Help:      metricsutil.HelpMsgWithStability("Plugin processing latency distribution in seconds for each extension point, plugin type and plugin name.", compbasemetrics.ALPHA),
			// Same bucket bounds as SchedulerE2ELatency: 0.1ms up to 100ms.
			Buckets: []float64{
				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
			},
		},
		[]string{"extension_point", "plugin_type", "plugin_name"},
	)

	// Prefix indexer Metrics
	//
	// PrefixCacheSize is a label-less gauge holding the size of the prefix indexer.
	// NOTE(review): the help text names no unit; the RecordPrefixCacheSize
	// documentation describes the value as megabytes — confirm.
	PrefixCacheSize = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_size",
			Help:      metricsutil.HelpMsgWithStability("Size of the prefix indexer.", compbasemetrics.ALPHA),
		},
		[]string{},
	)

	// PrefixCacheHitRatio is a label-less histogram of the ratio of matched prefix
	// length to total prefix length for a cache lookup.
	PrefixCacheHitRatio = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_hit_ratio",
			Help:      metricsutil.HelpMsgWithStability("Ratio of prefix length matched to total prefix length in the cache lookup.", compbasemetrics.ALPHA),

			// Bucket upper bounds from 0.0 to 1.0 in steps of 0.1.
			Buckets: []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
		},
		[]string{},
	)

	// PrefixCacheHitLength is a label-less histogram of the length of the prefix
	// match for a cache lookup; the help text gives the unit as bytes. Bucket
	// upper bounds are 0 followed by powers of two from 16 to 65536.
	PrefixCacheHitLength = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_hit_bytes",
			Help:      metricsutil.HelpMsgWithStability("Length of the prefix match in number of bytes in the cache lookup.", compbasemetrics.ALPHA),
			Buckets:   []float64{0, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536},
		},
		[]string{},
	)

	// Info Metrics
	//
	// InferenceExtensionInfo is a gauge labelled by commit and build_ref that
	// carries general information about the current Inference Extension build.
	InferenceExtensionInfo = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: InferenceExtension,
			Name:      "info",
			Help:      metricsutil.HelpMsgWithStability("General information of the current build of Inference Extension.", compbasemetrics.ALPHA),
		},
		[]string{"commit", "build_ref"},
	)
)

Functions

func DecFlowControlQueueSize added in v1.1.0

func DecFlowControlQueueSize(fairnessID, priority string)

DecFlowControlQueueSize decrements the Flow Control queue size gauge.

func DecRunningRequests added in v0.3.0

func DecRunningRequests(modelName string)

DecRunningRequests decreases the current running requests.

func IncFlowControlQueueSize added in v1.1.0

func IncFlowControlQueueSize(fairnessID, priority string)

IncFlowControlQueueSize increments the Flow Control queue size gauge.

func IncRunningRequests added in v0.3.0

func IncRunningRequests(modelName string)

IncRunningRequests increases the current running requests.

func RecordFlowControlRequestQueueDuration added in v1.1.0

func RecordFlowControlRequestQueueDuration(fairnessID, priority, outcome string, duration time.Duration)

RecordFlowControlRequestQueueDuration records the duration a request spent in the Flow Control layer.

func RecordInferenceExtensionInfo added in v0.4.0

func RecordInferenceExtensionInfo(commitSha, buildRef string)

func RecordInferencePoolAvgKVCache

func RecordInferencePoolAvgKVCache(name string, utilization float64)

func RecordInferencePoolAvgQueueSize

func RecordInferencePoolAvgQueueSize(name string, queueSize float64)

func RecordInferencePoolReadyPods added in v0.5.0

func RecordInferencePoolReadyPods(name string, runningPods float64)

func RecordInputTokens

func RecordInputTokens(modelName, targetModelName string, size int)

RecordInputTokens records input tokens count.

func RecordNormalizedTimePerOutputToken added in v0.4.0

func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool

RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.

func RecordOutputTokens

func RecordOutputTokens(modelName, targetModelName string, size int)

RecordOutputTokens records output tokens count.

func RecordPluginProcessingLatency added in v1.0.0

func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration)

RecordPluginProcessingLatency records the processing latency for a plugin.

func RecordPrefixCacheMatch added in v0.4.0

func RecordPrefixCacheMatch(matchedLength, totalLength int)

RecordPrefixCacheMatch records both the hit ratio and hit length for a prefix indexer match. matchedLength is the number of characters that matched, and totalLength is the total prefix length.

func RecordPrefixCacheSize added in v0.4.0

func RecordPrefixCacheSize(size int64)

RecordPrefixCacheSize records the size of the prefix indexer in megabytes.

func RecordRequestCounter

func RecordRequestCounter(modelName, targetModelName string)

RecordRequestCounter records the number of requests.

func RecordRequestErrCounter

func RecordRequestErrCounter(modelName, targetModelName string, code string)

RecordRequestErrCounter records the number of error requests.

func RecordRequestLatencies

func RecordRequestLatencies(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time) bool

RecordRequestLatencies records duration of request.

func RecordRequestSizes

func RecordRequestSizes(modelName, targetModelName string, reqSize int)

RecordRequestSizes records the request sizes.

func RecordResponseSizes

func RecordResponseSizes(modelName, targetModelName string, size int)

RecordResponseSizes records the response sizes.

func RecordSchedulerE2ELatency added in v0.4.0

func RecordSchedulerE2ELatency(duration time.Duration)

RecordSchedulerE2ELatency records the end-to-end scheduling latency.

func Register

func Register(customCollectors ...prometheus.Collector)

Register all metrics.

func Reset added in v0.4.0

func Reset()

Reset is intended for use in integration tests only.

Types

This section is empty.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL