Documentation ¶
Index ¶
- Constants
- Variables
- func DecFlowControlQueueSize(fairnessID, priority string)
- func DecRunningRequests(modelName string)
- func IncFlowControlQueueSize(fairnessID, priority string)
- func IncRunningRequests(modelName string)
- func RecordFlowControlRequestQueueDuration(fairnessID, priority, outcome string, duration time.Duration)
- func RecordInferenceExtensionInfo(commitSha, buildRef string)
- func RecordInferencePoolAvgKVCache(name string, utilization float64)
- func RecordInferencePoolAvgQueueSize(name string, queueSize float64)
- func RecordInferencePoolReadyPods(name string, runningPods float64)
- func RecordInputTokens(modelName, targetModelName string, size int)
- func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, ...) bool
- func RecordOutputTokens(modelName, targetModelName string, size int)
- func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration)
- func RecordPrefixCacheMatch(matchedLength, totalLength int)
- func RecordPrefixCacheSize(size int64)
- func RecordRequestCounter(modelName, targetModelName string)
- func RecordRequestErrCounter(modelName, targetModelName string, code string)
- func RecordRequestLatencies(ctx context.Context, modelName, targetModelName string, received time.Time, ...) bool
- func RecordRequestSizes(modelName, targetModelName string, reqSize int)
- func RecordResponseSizes(modelName, targetModelName string, size int)
- func RecordSchedulerE2ELatency(duration time.Duration)
- func Register(customCollectors ...prometheus.Collector)
- func Reset()
Constants ¶
const (
	InferenceObjectiveComponent = "inference_objective"
	InferencePoolComponent      = "inference_pool"
	InferenceExtension          = "inference_extension"
)
Variables ¶
var (
	// NTPOT - Normalized Time Per Output Token
	NormalizedTimePerOutputToken = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceObjectiveComponent,
			Name:      "normalized_time_per_output_token_seconds",
			Help:      metricsutil.HelpMsgWithStability("Inference objective latency divided by number of output tokens in seconds for each model and target model.", compbasemetrics.ALPHA),
			Buckets: []float64{
				0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0,
			},
		},
		[]string{"model_name", "target_model_name"},
	)

	// Scheduler Metrics
	SchedulerE2ELatency = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "scheduler_e2e_duration_seconds",
			Help:      metricsutil.HelpMsgWithStability("End-to-end scheduling latency distribution in seconds.", compbasemetrics.ALPHA),
			Buckets: []float64{
				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
			},
		},
		[]string{},
	)

	PluginProcessingLatencies = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "plugin_duration_seconds",
			Help:      metricsutil.HelpMsgWithStability("Plugin processing latency distribution in seconds for each extension point, plugin type and plugin name.", compbasemetrics.ALPHA),
			Buckets: []float64{
				0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,
			},
		},
		[]string{"extension_point", "plugin_type", "plugin_name"},
	)

	// Prefix indexer Metrics
	PrefixCacheSize = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_size",
			Help:      metricsutil.HelpMsgWithStability("Size of the prefix indexer.", compbasemetrics.ALPHA),
		},
		[]string{},
	)

	PrefixCacheHitRatio = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_hit_ratio",
			Help:      metricsutil.HelpMsgWithStability("Ratio of prefix length matched to total prefix length in the cache lookup.", compbasemetrics.ALPHA),
			Buckets:   []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
		},
		[]string{},
	)

	PrefixCacheHitLength = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: InferenceExtension,
			Name:      "prefix_indexer_hit_bytes",
			Help:      metricsutil.HelpMsgWithStability("Length of the prefix match in number of bytes in the cache lookup.", compbasemetrics.ALPHA),
			Buckets:   []float64{0, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536},
		},
		[]string{},
	)

	// Info Metrics
	InferenceExtensionInfo = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: InferenceExtension,
			Name:      "info",
			Help:      metricsutil.HelpMsgWithStability("General information of the current build of Inference Extension.", compbasemetrics.ALPHA),
		},
		[]string{"commit", "build_ref"},
	)
)
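The collectors above only export data once registered via Register (see the index). A minimal wiring sketch, assuming the import path shown in the comment; the custom gauge, commit SHA, and build ref are placeholders:

// A wiring sketch, not part of this package. Assumptions: the import path below,
// and that Register is called once at startup before any metric is recorded.
package main

import (
	"github.com/prometheus/client_golang/prometheus"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

// exampleGauge is an illustrative caller-owned collector passed through Register's
// variadic customCollectors parameter.
var exampleGauge = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: "example_custom_gauge",
	Help: "Illustrative collector registered alongside the package metrics.",
})

func main() {
	// Register the package's collectors plus any caller-defined ones.
	metrics.Register(exampleGauge)

	// Placeholder commit SHA and build ref for the info gauge defined above.
	metrics.RecordInferenceExtensionInfo("0000000", "v0.0.0-example")
}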
Functions ¶
func DecFlowControlQueueSize ¶ added in v1.1.0
func DecFlowControlQueueSize(fairnessID, priority string)
DecFlowControlQueueSize decrements the Flow Control queue size gauge.
func DecRunningRequests ¶ added in v0.3.0
func DecRunningRequests(modelName string)
DecRunningRequests decreases the count of currently running requests for the given model.
func IncFlowControlQueueSize ¶ added in v1.1.0
func IncFlowControlQueueSize(fairnessID, priority string)
IncFlowControlQueueSize increments the Flow Control queue size gauge.
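IncFlowControlQueueSize and DecFlowControlQueueSize are intended to be kept in balance so the gauge tracks live queue depth. A minimal sketch of pairing them around a queued item; the flowQueue and item types and the import path are assumptions, not part of this package:

// A sketch, not part of this package: keep the gauge balanced by incrementing on
// push and decrementing on pop.
package flow

import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path

type item struct{ fairnessID, priority string }

type flowQueue struct{ items []item }

func (q *flowQueue) push(it item) {
	q.items = append(q.items, it)
	metrics.IncFlowControlQueueSize(it.fairnessID, it.priority)
}

func (q *flowQueue) pop() (item, bool) {
	if len(q.items) == 0 {
		return item{}, false
	}
	it := q.items[0]
	q.items = q.items[1:]
	metrics.DecFlowControlQueueSize(it.fairnessID, it.priority)
	return it, true
}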
func IncRunningRequests ¶ added in v0.3.0
func IncRunningRequests(modelName string)
IncRunningRequests increases the count of currently running requests for the given model.
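A minimal sketch of bracketing request handling with this pair, deferring the decrement so the gauge drops on every exit path; handleRequest and the import path are assumptions:

// A sketch, not part of this package.
package serving

import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path

func handleRequest(modelName string, do func() error) error {
	metrics.IncRunningRequests(modelName)
	// Deferred so the gauge is decremented even when do() returns an error.
	defer metrics.DecRunningRequests(modelName)
	return do()
}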
func RecordFlowControlRequestQueueDuration ¶ added in v1.1.0
func RecordFlowControlRequestQueueDuration(fairnessID, priority, outcome string, duration time.Duration)
RecordFlowControlRequestQueueDuration records the duration a request spent in the Flow Control layer.
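A sketch of measuring the queue wait with time.Since and labelling it with an outcome; the outcome strings "dispatched" and "rejected", the 30-second timeout, and the import path are illustrative assumptions:

// A sketch, not part of this package.
package flow

import (
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

func waitForDispatch(fairnessID, priority string, dispatched <-chan struct{}) {
	enqueuedAt := time.Now()
	select {
	case <-dispatched:
		// Request left the queue normally.
		metrics.RecordFlowControlRequestQueueDuration(fairnessID, priority, "dispatched", time.Since(enqueuedAt))
	case <-time.After(30 * time.Second):
		// Request timed out while queued.
		metrics.RecordFlowControlRequestQueueDuration(fairnessID, priority, "rejected", time.Since(enqueuedAt))
	}
}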
func RecordInferenceExtensionInfo ¶ added in v0.4.0
func RecordInferenceExtensionInfo(commitSha, buildRef string)
func RecordInferencePoolAvgKVCache ¶
func RecordInferencePoolAvgKVCache(name string, utilization float64)
func RecordInferencePoolAvgQueueSize ¶
func RecordInferencePoolAvgQueueSize(name string, queueSize float64)
func RecordInferencePoolReadyPods ¶ added in v0.5.0
func RecordInferencePoolReadyPods(name string, runningPods float64)
func RecordInputTokens ¶
func RecordInputTokens(modelName, targetModelName string, size int)
RecordInputTokens records the input token count.
func RecordNormalizedTimePerOutputToken ¶ added in v0.4.0
func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool
RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.
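A sketch of calling it when a response completes, using the request's received timestamp and the final output token count; the reading of the bool return as "observation recorded", and the import path, are assumptions:

// A sketch, not part of this package.
package handlers

import (
	"context"
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

func onResponseComplete(ctx context.Context, modelName, targetModelName string, received time.Time, outputTokens int) bool {
	// The bool return is assumed to report whether the observation was recorded.
	return metrics.RecordNormalizedTimePerOutputToken(ctx, modelName, targetModelName, received, time.Now(), outputTokens)
}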
func RecordOutputTokens ¶
func RecordOutputTokens(modelName, targetModelName string, size int)
RecordOutputTokens records the output token count.
func RecordPluginProcessingLatency ¶ added in v1.0.0
func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration)
RecordPluginProcessingLatency records the processing latency for a plugin.
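A sketch of timing a single plugin invocation with time.Since; the wrapper function and the import path are assumptions:

// A sketch, not part of this package.
package scheduling

import (
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

func timedPluginCall(extensionPoint, pluginType, pluginName string, call func()) {
	start := time.Now()
	call()
	metrics.RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName, time.Since(start))
}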
func RecordPrefixCacheMatch ¶ added in v0.4.0
func RecordPrefixCacheMatch(matchedLength, totalLength int)
RecordPrefixCacheMatch records both the hit ratio and hit length for a prefix indexer match. matchedLength is the number of characters that matched, and totalLength is the total prefix length.
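A sketch of feeding a cache lookup result into this helper; longestMatch is a placeholder for the indexer's longest-prefix lookup, and the import path is an assumption:

// A sketch, not part of this package.
package prefix

import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path

func recordLookup(prompt string, longestMatch func(string) int) {
	matched := longestMatch(prompt)
	// Feeds both the hit-ratio (matched/total) and hit-length histograms defined above.
	metrics.RecordPrefixCacheMatch(matched, len(prompt))
}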
func RecordPrefixCacheSize ¶ added in v0.4.0
func RecordPrefixCacheSize(size int64)
RecordPrefixCacheSize records the size of the prefix indexer in megabytes.
func RecordRequestCounter ¶
func RecordRequestCounter(modelName, targetModelName string)
RecordRequestCounter records the number of requests.
func RecordRequestErrCounter ¶
func RecordRequestErrCounter(modelName, targetModelName string, code string)
RecordRequestErrCounter records the number of requests that resulted in an error, labelled by error code.
func RecordRequestLatencies ¶
func RecordRequestLatencies(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time) bool
RecordRequestLatencies records the duration of a request, from the received timestamp to the completion timestamp.
func RecordRequestSizes ¶
func RecordRequestSizes(modelName, targetModelName string, reqSize int)
RecordRequestSizes records the request sizes.
func RecordResponseSizes ¶
func RecordResponseSizes(modelName, targetModelName string, size int)
RecordResponseSizes records the response sizes.
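Taken together, the request-level helpers above support per-request bookkeeping along the lines of the following sketch; the result type, the "InternalError" code, and the import path are assumptions:

// A sketch, not part of this package.
package handlers

import (
	"context"
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" // assumed import path
)

type result struct {
	err          error
	responseSize int
}

func observeRequest(ctx context.Context, modelName, targetModelName string, reqSize int, do func() result) {
	metrics.RecordRequestCounter(modelName, targetModelName)
	metrics.RecordRequestSizes(modelName, targetModelName, reqSize)

	received := time.Now()
	res := do()

	if res.err != nil {
		metrics.RecordRequestErrCounter(modelName, targetModelName, "InternalError") // placeholder code
		return
	}
	metrics.RecordRequestLatencies(ctx, modelName, targetModelName, received, time.Now())
	metrics.RecordResponseSizes(modelName, targetModelName, res.responseSize)
}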
func RecordSchedulerE2ELatency ¶ added in v0.4.0
func RecordSchedulerE2ELatency(duration time.Duration)
RecordSchedulerE2ELatency records the end-to-end scheduling latency.
func Register ¶
func Register(customCollectors ...prometheus.Collector)
func Reset ¶
func Reset()
Types ¶
This section is empty.