From 46bd84cb9f68cbe1f94193402738c783d6ffbe9e Mon Sep 17 00:00:00 2001 From: Charlie Le Date: Thu, 18 Jun 2026 10:58:15 -0700 Subject: [PATCH 1/3] Add native histogram support to all remaining production histograms Convert ~45 remaining classic-only histograms to dual-format (classic + native) by adding NativeHistogramBucketFactor, NativeHistogramMaxBucketNumber, and NativeHistogramMinResetDuration fields. This follows the same pattern already established for ~11 histograms in the codebase. Signed-off-by: Charlie Le --- pkg/alertmanager/alertmanager_client.go | 9 +++-- pkg/alertmanager/state_replication.go | 9 +++-- pkg/api/handlers.go | 34 ++++++++++++------- pkg/chunk/cache/instrumented.go | 14 +++++--- pkg/chunk/cache/memcached.go | 7 ++-- pkg/chunk/cache/redis_cache.go | 13 ++++--- pkg/compactor/compactor_metrics.go | 19 ++++++++--- pkg/configs/client/client.go | 11 +++--- pkg/configs/db/timed.go | 12 ++++--- pkg/distributor/distributor.go | 22 +++++++----- pkg/frontend/transport/retry.go | 12 ++++--- pkg/frontend/v1/frontend.go | 9 +++-- pkg/ha/ha_tracker.go | 9 +++-- pkg/ingester/client/client.go | 12 ++++--- pkg/ingester/ingester.go | 27 ++++++++++----- pkg/ingester/metrics.go | 20 ++++++++--- pkg/parquetconverter/metrics.go | 11 ++++-- pkg/querier/blocks_finder_bucket_scan.go | 9 +++-- pkg/querier/blocks_store_queryable.go | 22 +++++++----- pkg/querier/store_gateway_client.go | 13 ++++--- .../exemplar_merge_queryable.go | 12 ++++--- .../tenantfederation/merge_queryable.go | 12 ++++--- .../metadata_merge_querier.go | 12 ++++--- pkg/querier/tripperware/instrumentation.go | 11 +++--- pkg/querier/worker/scheduler_processor.go | 9 +++-- pkg/ring/kv/consul/metrics.go | 10 ++++-- pkg/ring/kv/dynamodb/metrics.go | 10 ++++-- pkg/ring/kv/metrics.go | 3 ++ pkg/ring/lifecycler_metrics.go | 13 ++++--- pkg/ruler/client_pool.go | 9 +++-- pkg/ruler/frontend_client_pool.go | 9 +++-- pkg/scheduler/scheduler.go | 9 +++-- pkg/storage/tsdb/bucketindex/loader.go | 9 +++-- 
pkg/storage/tsdb/multilevel_bucket_cache.go | 18 ++++++---- pkg/storage/tsdb/multilevel_index_cache.go | 19 +++++++---- pkg/storegateway/bucket_stores.go | 9 +++-- 36 files changed, 316 insertions(+), 152 deletions(-) diff --git a/pkg/alertmanager/alertmanager_client.go b/pkg/alertmanager/alertmanager_client.go index 5e144e579a2..f81debd35ed 100644 --- a/pkg/alertmanager/alertmanager_client.go +++ b/pkg/alertmanager/alertmanager_client.go @@ -73,9 +73,12 @@ func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientC } requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_alertmanager_distributor_client_request_duration_seconds", - Help: "Time spent executing requests from an alertmanager to another alertmanager.", - Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), + Name: "cortex_alertmanager_distributor_client_request_duration_seconds", + Help: "Time spent executing requests from an alertmanager to another alertmanager.", + Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"}) factory := func(addr string) (client.PoolClient, error) { diff --git a/pkg/alertmanager/state_replication.go b/pkg/alertmanager/state_replication.go index bb317bd3191..ea60f0358bf 100644 --- a/pkg/alertmanager/state_replication.go +++ b/pkg/alertmanager/state_replication.go @@ -111,9 +111,12 @@ func newReplicatedStates(userID string, rf int, re Replicator, st alertstore.Ale Help: "Number of times we have completed syncing initial state for each possible outcome.", }, []string{"outcome"}), initialSyncDuration: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ - Name: "alertmanager_state_initial_sync_duration_seconds", - Help: "Time spent syncing initial state from peers or remote storage.", - Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), + Name: 
"alertmanager_state_initial_sync_duration_seconds", + Help: "Time spent syncing initial state from peers or remote storage.", + Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } s.initialSyncCompleted.WithLabelValues(syncFromReplica) diff --git a/pkg/api/handlers.go b/pkg/api/handlers.go index 5852a65cbb1..cf91163b511 100644 --- a/pkg/api/handlers.go +++ b/pkg/api/handlers.go @@ -8,6 +8,7 @@ import ( "net/http" "path" "sync" + "time" "github.com/go-kit/log" "github.com/go-kit/log/level" @@ -171,24 +172,33 @@ func NewQuerierHandler( ) http.Handler { // Prometheus histograms for requests to the querier. querierRequestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_request_duration_seconds", - Help: "Time (in seconds) spent serving HTTP requests to the querier.", - Buckets: instrument.DefBuckets, + Namespace: "cortex", + Name: "querier_request_duration_seconds", + Help: "Time (in seconds) spent serving HTTP requests to the querier.", + Buckets: instrument.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "route", "status_code", "ws"}) receivedMessageSize := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_request_message_bytes", - Help: "Size (in bytes) of messages received in the request to the querier.", - Buckets: middleware.BodySizeBuckets, + Namespace: "cortex", + Name: "querier_request_message_bytes", + Help: "Size (in bytes) of messages received in the request to the querier.", + Buckets: middleware.BodySizeBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "route"}) sentMessageSize := 
promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_response_message_bytes", - Help: "Size (in bytes) of messages sent in response by the querier.", - Buckets: middleware.BodySizeBuckets, + Namespace: "cortex", + Name: "querier_response_message_bytes", + Help: "Size (in bytes) of messages sent in response by the querier.", + Buckets: middleware.BodySizeBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "route"}) inflightRequests := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ diff --git a/pkg/chunk/cache/instrumented.go b/pkg/chunk/cache/instrumented.go index 7c9062581f2..3c762ae1f0d 100644 --- a/pkg/chunk/cache/instrumented.go +++ b/pkg/chunk/cache/instrumented.go @@ -20,8 +20,11 @@ func Instrument(name string, cache Cache, reg prometheus.Registerer) Cache { // Cached chunks are generally in the KBs, but cached index can // get big. Histogram goes from 1KB to 4MB. // 1024 * 4^(7-1) = 4MB - Buckets: prometheus.ExponentialBuckets(1024, 4, 7), - ConstLabels: prometheus.Labels{"name": name}, + Buckets: prometheus.ExponentialBuckets(1024, 4, 7), + ConstLabels: prometheus.Labels{"name": name}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method"}) return &instrumentedCache{ @@ -33,8 +36,11 @@ func Instrument(name string, cache Cache, reg prometheus.Registerer) Cache { Name: "cache_request_duration_seconds", Help: "Total time spent in seconds doing cache requests.", // Cache requests are very quick: smallest bucket is 16us, biggest is 1s. 
- Buckets: prometheus.ExponentialBuckets(0.000016, 4, 8), - ConstLabels: prometheus.Labels{"name": name}, + Buckets: prometheus.ExponentialBuckets(0.000016, 4, 8), + ConstLabels: prometheus.Labels{"name": name}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "status_code"})), fetchedKeys: promauto.With(reg).NewCounter(prometheus.CounterOpts{ diff --git a/pkg/chunk/cache/memcached.go b/pkg/chunk/cache/memcached.go index 70ee76fa41d..8194dfa9f09 100644 --- a/pkg/chunk/cache/memcached.go +++ b/pkg/chunk/cache/memcached.go @@ -62,8 +62,11 @@ func NewMemcached(cfg MemcachedConfig, client MemcachedClient, name string, reg Name: "memcache_request_duration_seconds", Help: "Total time spent in seconds doing memcache requests.", // Memcached requests are very quick: smallest bucket is 16us, biggest is 1s - Buckets: prometheus.ExponentialBuckets(0.000016, 4, 8), - ConstLabels: prometheus.Labels{"name": name}, + Buckets: prometheus.ExponentialBuckets(0.000016, 4, 8), + ConstLabels: prometheus.Labels{"name": name}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "status_code"}), ), } diff --git a/pkg/chunk/cache/redis_cache.go b/pkg/chunk/cache/redis_cache.go index 6cc9206cc04..3805fb4ca5c 100644 --- a/pkg/chunk/cache/redis_cache.go +++ b/pkg/chunk/cache/redis_cache.go @@ -31,11 +31,14 @@ func NewRedisCache(name string, redisClient *RedisClient, reg prometheus.Registe logger: logger, requestDuration: instr.NewHistogramCollector( promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "rediscache_request_duration_seconds", - Help: "Total time spent in seconds doing Redis requests.", - Buckets: prometheus.ExponentialBuckets(0.000016, 4, 8), - ConstLabels: prometheus.Labels{"name": name}, + Namespace: "cortex", + Name: "rediscache_request_duration_seconds", 
+ Help: "Total time spent in seconds doing Redis requests.", + Buckets: prometheus.ExponentialBuckets(0.000016, 4, 8), + ConstLabels: prometheus.Labels{"name": name}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "status_code"}), ), } diff --git a/pkg/compactor/compactor_metrics.go b/pkg/compactor/compactor_metrics.go index 6c7f9c14087..d777ca9e8e7 100644 --- a/pkg/compactor/compactor_metrics.go +++ b/pkg/compactor/compactor_metrics.go @@ -1,6 +1,8 @@ package compactor import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/thanos-io/thanos/pkg/block" @@ -89,9 +91,12 @@ func newCompactorMetricsWithLabels(reg prometheus.Registerer, commonLabels []str Help: "Total blocks metadata synchronization failures.", }, nil) m.metaFetcherSyncDuration = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_compactor_meta_sync_duration_seconds", - Help: "Duration of the blocks metadata synchronization in seconds.", - Buckets: []float64{0.01, 1, 10, 100, 300, 600, 1000}, + Name: "cortex_compactor_meta_sync_duration_seconds", + Help: "Duration of the blocks metadata synchronization in seconds.", + Buckets: []float64{0.01, 1, 10, 100, 300, 600, 1000}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, nil) m.metaFetcherSynced = extprom.NewTxGaugeVec( reg, @@ -126,8 +131,12 @@ func newCompactorMetricsWithLabels(reg prometheus.Registerer, commonLabels []str Help: "Total number of failed garbage collection operations.", }, nil) m.syncerGarbageCollectionDuration = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_compactor_garbage_collection_duration_seconds", - Help: "Time it took to perform garbage collection iteration.", + Name: "cortex_compactor_garbage_collection_duration_seconds", + 
Help: "Time it took to perform garbage collection iteration.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, nil) m.syncerBlocksMarkedForDeletion = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: blocksMarkedForDeletionName, diff --git a/pkg/configs/client/client.go b/pkg/configs/client/client.go index 263d59ca508..1bc73578ae6 100644 --- a/pkg/configs/client/client.go +++ b/pkg/configs/client/client.go @@ -42,10 +42,13 @@ func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { } var configsRequestDuration = instrument.NewHistogramCollector(promauto.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "configs_request_duration_seconds", - Help: "Time spent requesting userconfig.", - Buckets: prometheus.DefBuckets, + Namespace: "cortex", + Name: "configs_request_duration_seconds", + Help: "Time spent requesting userconfig.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"})) // Client is what the ruler and altermanger needs from a config store to process rules. 
diff --git a/pkg/configs/db/timed.go b/pkg/configs/db/timed.go index 58cbfdc9e19..bd164b8131b 100644 --- a/pkg/configs/db/timed.go +++ b/pkg/configs/db/timed.go @@ -2,6 +2,7 @@ package db import ( "context" + "time" "github.com/prometheus/client_golang/prometheus" "github.com/weaveworks/common/instrument" @@ -11,10 +12,13 @@ import ( var ( databaseRequestDuration = instrument.NewHistogramCollector(prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "database_request_duration_seconds", - Help: "Time spent (in seconds) doing database requests.", - Buckets: prometheus.DefBuckets, + Namespace: "cortex", + Name: "database_request_duration_seconds", + Help: "Time spent (in seconds) doing database requests.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "status_code"})) ) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 7dd2e66c4bd..edf9b750b87 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -336,10 +336,13 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "distributor_query_duration_seconds", - Help: "Time spent executing expression and exemplar queries.", - Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 20, 30}, + Namespace: "cortex", + Name: "distributor_query_duration_seconds", + Help: "Time spent executing expression and exemplar queries.", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 20, 30}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", 
"status_code"})), receivedSamples: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Namespace: "cortex", @@ -396,10 +399,13 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove Help: "The total number of deduplicated samples.", }, []string{"user", "cluster"}), labelsHistogram: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "labels_per_sample", - Help: "Number of labels per sample.", - Buckets: []float64{5, 10, 15, 20, 25}, + Namespace: "cortex", + Name: "labels_per_sample", + Help: "Number of labels per sample.", + Buckets: []float64{5, 10, 15, 20, 25}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), ingesterAppends: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Namespace: "cortex", diff --git a/pkg/frontend/transport/retry.go b/pkg/frontend/transport/retry.go index e28abc72efb..ccf662f8bca 100644 --- a/pkg/frontend/transport/retry.go +++ b/pkg/frontend/transport/retry.go @@ -4,6 +4,7 @@ import ( "context" "errors" "strings" + "time" "unsafe" "github.com/prometheus/client_golang/prometheus" @@ -24,10 +25,13 @@ func NewRetry(maxRetries int, reg prometheus.Registerer) *Retry { return &Retry{ maxRetries: maxRetries, retriesCount: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "query_frontend_retries", - Help: "Number of times a request is retried.", - Buckets: []float64{0, 1, 2, 3, 4, 5}, + Namespace: "cortex", + Name: "query_frontend_retries", + Help: "Number of times a request is retried.", + Buckets: []float64{0, 1, 2, 3, 4, 5}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } } diff --git a/pkg/frontend/v1/frontend.go b/pkg/frontend/v1/frontend.go index c4294aaaeac..f58cde326af 100644 --- a/pkg/frontend/v1/frontend.go +++ b/pkg/frontend/v1/frontend.go @@ -125,9 +125,12 @@ func 
New(cfg Config, limits Limits, log log.Logger, registerer prometheus.Regist Help: "Total number of query requests discarded.", }, []string{"user", "priority"}), queueDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_query_frontend_queue_duration_seconds", - Help: "Time spend by requests queued.", - Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 20, 30, 60}, + Name: "cortex_query_frontend_queue_duration_seconds", + Help: "Time spend by requests queued.", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 20, 30, 60}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } diff --git a/pkg/ha/ha_tracker.go b/pkg/ha/ha_tracker.go index cf8cef088d1..6c9722ea8a8 100644 --- a/pkg/ha/ha_tracker.go +++ b/pkg/ha/ha_tracker.go @@ -264,9 +264,12 @@ func NewHATracker(cfg HATrackerConfig, limits HATrackerLimits, trackerStatusConf Help: "The timestamp stored for the currently elected replica, from the KVStore.", }, []string{"user", "cluster"}), electedReplicaPropagationTime: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Name: "ha_tracker_elected_replica_change_propagation_time_seconds", - Help: "The time it for the distributor to update the replica change.", - Buckets: prometheus.DefBuckets, + Name: "ha_tracker_elected_replica_change_propagation_time_seconds", + Help: "The time it for the distributor to update the replica change.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), kvCASCalls: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "ha_tracker_kv_store_cas_total", diff --git a/pkg/ingester/client/client.go b/pkg/ingester/client/client.go index e197109e522..0604b0de7f7 100644 --- a/pkg/ingester/client/client.go +++ b/pkg/ingester/client/client.go @@ -8,6 +8,7 @@ import ( "net/http" 
"strings" "sync" + "time" "github.com/go-kit/log" "github.com/pkg/errors" @@ -26,10 +27,13 @@ import ( ) var ingesterClientRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "ingester_client_request_duration_seconds", - Help: "Time spent doing Ingester requests.", - Buckets: prometheus.ExponentialBuckets(0.001, 4, 7), + Namespace: "cortex", + Name: "ingester_client_request_duration_seconds", + Help: "Time spent doing Ingester requests.", + Buckets: prometheus.ExponentialBuckets(0.001, 4, 7), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"}) var ingesterClientInflightPushRequests = promauto.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "cortex", diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index cdc729c8e79..81780e4a01e 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -773,19 +773,28 @@ func newTSDBState(bucketClient objstore.Bucket, registerer prometheus.Registerer Help: "Total number of compactions that failed.", }), walReplayTime: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_ingester_tsdb_wal_replay_duration_seconds", - Help: "The total time it takes to open and replay a TSDB WAL.", - Buckets: prometheus.DefBuckets, + Name: "cortex_ingester_tsdb_wal_replay_duration_seconds", + Help: "The total time it takes to open and replay a TSDB WAL.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), appenderAddDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_ingester_tsdb_appender_add_duration_seconds", - Help: "The total time it takes for a push request to add samples to the TSDB appender.", - Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, + Name: 
"cortex_ingester_tsdb_appender_add_duration_seconds", + Help: "The total time it takes for a push request to add samples to the TSDB appender.", + Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), appenderCommitDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_ingester_tsdb_appender_commit_duration_seconds", - Help: "The total time it takes for a push request to commit samples appended to TSDB.", - Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, + Name: "cortex_ingester_tsdb_appender_commit_duration_seconds", + Help: "The total time it takes for a push request to commit samples appended to TSDB.", + Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), idleTsdbChecks: idleTsdbChecks, diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 07e0cde72bb..bd0ed95ebf8 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -161,25 +161,37 @@ func newIngesterMetrics(r prometheus.Registerer, Name: "cortex_ingester_queried_samples", Help: "The total number of samples returned from queries.", // Could easily return 10m samples per query - 10*(8^(8-1)) = 20.9m. - Buckets: prometheus.ExponentialBuckets(10, 8, 8), + Buckets: prometheus.ExponentialBuckets(10, 8, 8), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), queriedExemplars: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ Name: "cortex_ingester_queried_exemplars", Help: "The total number of exemplars returned from queries.", // A reasonable upper bound is around 6k - 10*(5^(5-1)) = 6250. 
- Buckets: prometheus.ExponentialBuckets(10, 5, 5), + Buckets: prometheus.ExponentialBuckets(10, 5, 5), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), queriedSeries: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ Name: "cortex_ingester_queried_series", Help: "The total number of series returned from queries.", // A reasonable upper bound is around 100k - 10*(8^(6-1)) = 327k. - Buckets: prometheus.ExponentialBuckets(10, 8, 6), + Buckets: prometheus.ExponentialBuckets(10, 8, 6), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), queriedChunks: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ Name: "cortex_ingester_queried_chunks", Help: "The total number of chunks returned from queries.", // A small number of chunks per series - 10*(8^(7-1)) = 2.6m. - Buckets: prometheus.ExponentialBuckets(10, 8, 7), + Buckets: prometheus.ExponentialBuckets(10, 8, 7), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), memSeries: promauto.With(r).NewGauge(prometheus.GaugeOpts{ Name: "cortex_ingester_memory_series", diff --git a/pkg/parquetconverter/metrics.go b/pkg/parquetconverter/metrics.go index 2b3e80b0cfd..d926ed6d465 100644 --- a/pkg/parquetconverter/metrics.go +++ b/pkg/parquetconverter/metrics.go @@ -1,6 +1,8 @@ package parquetconverter import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) @@ -28,9 +30,12 @@ func newMetrics(reg prometheus.Registerer) *metrics { Help: "Time taken to for the latest block conversion for the user.", }, []string{"user"}), convertParquetBlockDelay: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_parquet_converter_convert_block_delay_minutes", - Help: "Delay in minutes of Parquet block to be converted from the 
TSDB block being uploaded to object store", - Buckets: []float64{5, 10, 15, 20, 30, 45, 60, 80, 100, 120, 150, 180, 210, 240, 270, 300}, + Name: "cortex_parquet_converter_convert_block_delay_minutes", + Help: "Delay in minutes of Parquet block to be converted from the TSDB block being uploaded to object store", + Buckets: []float64{5, 10, 15, 20, 30, 45, 60, 80, 100, 120, 150, 180, 210, 240, 270, 300}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), ownedUsers: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ Name: "cortex_parquet_converter_users_owned", diff --git a/pkg/querier/blocks_finder_bucket_scan.go b/pkg/querier/blocks_finder_bucket_scan.go index 1e57b30c7e7..aae57708108 100644 --- a/pkg/querier/blocks_finder_bucket_scan.go +++ b/pkg/querier/blocks_finder_bucket_scan.go @@ -91,9 +91,12 @@ func NewBucketScanBlocksFinder(cfg BucketScanBlocksFinderConfig, usersScanner us fetchersMetrics: storegateway.NewMetadataFetcherMetrics(), usersScanner: usersScanner, scanDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_querier_blocks_scan_duration_seconds", - Help: "The total time it takes to run a full blocks scan across the storage.", - Buckets: []float64{1, 10, 20, 30, 60, 120, 180, 240, 300, 600}, + Name: "cortex_querier_blocks_scan_duration_seconds", + Help: "The total time it takes to run a full blocks scan across the storage.", + Buckets: []float64{1, 10, 20, 30, 60, 120, 180, 240, 300, 600}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), scanLastSuccess: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ Name: "cortex_querier_blocks_last_successful_scan_timestamp_seconds", diff --git a/pkg/querier/blocks_store_queryable.go b/pkg/querier/blocks_store_queryable.go index b0e4ea648ae..e7063d99bb8 100644 --- a/pkg/querier/blocks_store_queryable.go +++ 
b/pkg/querier/blocks_store_queryable.go @@ -116,16 +116,22 @@ type blocksStoreQueryableMetrics struct { func newBlocksStoreQueryableMetrics(reg prometheus.Registerer) *blocksStoreQueryableMetrics { return &blocksStoreQueryableMetrics{ storesHit: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_storegateway_instances_hit_per_query", - Help: "Number of store-gateway instances hit for a single query.", - Buckets: []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + Namespace: "cortex", + Name: "querier_storegateway_instances_hit_per_query", + Help: "Number of store-gateway instances hit for a single query.", + Buckets: []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), refetches: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_storegateway_refetches_per_query", - Help: "Number of re-fetches attempted while querying store-gateway instances due to missing blocks.", - Buckets: []float64{0, 1, 2, 4, 8}, + Namespace: "cortex", + Name: "querier_storegateway_refetches_per_query", + Help: "Number of re-fetches attempted while querying store-gateway instances due to missing blocks.", + Buckets: []float64{0, 1, 2, 4, 8}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } } diff --git a/pkg/querier/store_gateway_client.go b/pkg/querier/store_gateway_client.go index 140e3b39078..80a869f8f48 100644 --- a/pkg/querier/store_gateway_client.go +++ b/pkg/querier/store_gateway_client.go @@ -19,11 +19,14 @@ import ( func newStoreGatewayClientFactory(clientCfg grpcclient.ConfigWithHealthCheck, reg prometheus.Registerer) client.PoolFactory { requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "storegateway_client_request_duration_seconds", - Help: 
"Time spent executing requests to the store-gateway.", - Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), - ConstLabels: prometheus.Labels{"client": "querier"}, + Namespace: "cortex", + Name: "storegateway_client_request_duration_seconds", + Help: "Time spent executing requests to the store-gateway.", + Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), + ConstLabels: prometheus.Labels{"client": "querier"}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"}) return func(addr string) (client.PoolClient, error) { diff --git a/pkg/querier/tenantfederation/exemplar_merge_queryable.go b/pkg/querier/tenantfederation/exemplar_merge_queryable.go index a244dd172ea..6262c9d195a 100644 --- a/pkg/querier/tenantfederation/exemplar_merge_queryable.go +++ b/pkg/querier/tenantfederation/exemplar_merge_queryable.go @@ -3,6 +3,7 @@ package tenantfederation import ( "context" "fmt" + "time" "github.com/go-kit/log/level" "github.com/pkg/errors" @@ -85,10 +86,13 @@ func NewMergeExemplarQueryable(idLabelName string, maxConcurrent int, callback M maxConcurrent: maxConcurrent, tenantsPerExemplarQuery: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_federated_tenants_per_exemplar_query", - Help: "Number of tenants per exemplar query.", - Buckets: []float64{1, 2, 4, 8, 16, 32, 64}, + Namespace: "cortex", + Name: "querier_federated_tenants_per_exemplar_query", + Help: "Number of tenants per exemplar query.", + Buckets: []float64{1, 2, 4, 8, 16, 32, 64}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } } diff --git a/pkg/querier/tenantfederation/merge_queryable.go b/pkg/querier/tenantfederation/merge_queryable.go index e752d9dea74..60faa7824fc 100644 --- a/pkg/querier/tenantfederation/merge_queryable.go +++ 
b/pkg/querier/tenantfederation/merge_queryable.go @@ -5,6 +5,7 @@ import ( "fmt" "sort" "strings" + "time" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -97,10 +98,13 @@ func NewMergeQueryable(idLabelName string, maxConcurrent int, callback MergeQuer allowPartialData: allowPartialData, tenantsPerQuery: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_federated_tenants_per_query", - Help: "Number of tenants per query.", - Buckets: []float64{1, 2, 4, 8, 16, 32, 64}, + Namespace: "cortex", + Name: "querier_federated_tenants_per_query", + Help: "Number of tenants per query.", + Buckets: []float64{1, 2, 4, 8, 16, 32, 64}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } } diff --git a/pkg/querier/tenantfederation/metadata_merge_querier.go b/pkg/querier/tenantfederation/metadata_merge_querier.go index 5b474ea174e..de020dbb6e3 100644 --- a/pkg/querier/tenantfederation/metadata_merge_querier.go +++ b/pkg/querier/tenantfederation/metadata_merge_querier.go @@ -3,6 +3,7 @@ package tenantfederation import ( "context" "fmt" + "time" "github.com/go-kit/log/level" "github.com/pkg/errors" @@ -27,10 +28,13 @@ func NewMetadataQuerier(upstream querier.MetadataQuerier, cfg Config, reg promet allowPartialData: cfg.AllowPartialData, tenantsPerMetadataQuery: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "querier_federated_tenants_per_metadata_query", - Help: "Number of tenants per metadata query.", - Buckets: []float64{1, 2, 4, 8, 16, 32, 64}, + Namespace: "cortex", + Name: "querier_federated_tenants_per_metadata_query", + Help: "Number of tenants per metadata query.", + Buckets: []float64{1, 2, 4, 8, 16, 32, 64}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } } diff --git 
a/pkg/querier/tripperware/instrumentation.go b/pkg/querier/tripperware/instrumentation.go index a7daf631a9c..cdb93ff10ea 100644 --- a/pkg/querier/tripperware/instrumentation.go +++ b/pkg/querier/tripperware/instrumentation.go @@ -42,10 +42,13 @@ type InstrumentMiddlewareMetrics struct { func NewInstrumentMiddlewareMetrics(registerer prometheus.Registerer) *InstrumentMiddlewareMetrics { return &InstrumentMiddlewareMetrics{ duration: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "cortex", - Name: "frontend_query_range_duration_seconds", - Help: "Total time spent in seconds doing query range requests.", - Buckets: prometheus.DefBuckets, + Namespace: "cortex", + Name: "frontend_query_range_duration_seconds", + Help: "Total time spent in seconds doing query range requests.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"method", "status_code"}), } } diff --git a/pkg/querier/worker/scheduler_processor.go b/pkg/querier/worker/scheduler_processor.go index 8a31ad72118..d152e057549 100644 --- a/pkg/querier/worker/scheduler_processor.go +++ b/pkg/querier/worker/scheduler_processor.go @@ -43,9 +43,12 @@ func newSchedulerProcessor(cfg Config, handler RequestHandler, log log.Logger, r return schedulerpb.NewSchedulerForQuerierClient(conn) }, frontendClientRequestDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_querier_query_frontend_request_duration_seconds", - Help: "Time spend doing requests to frontend.", - Buckets: prometheus.ExponentialBuckets(0.001, 4, 6), + Name: "cortex_querier_query_frontend_request_duration_seconds", + Help: "Time spend doing requests to frontend.", + Buckets: prometheus.ExponentialBuckets(0.001, 4, 6), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", 
"status_code"}), querierAddress: querierAddress, } diff --git a/pkg/ring/kv/consul/metrics.go b/pkg/ring/kv/consul/metrics.go index 52a1d4e843a..80c94d90e05 100644 --- a/pkg/ring/kv/consul/metrics.go +++ b/pkg/ring/kv/consul/metrics.go @@ -2,6 +2,7 @@ package consul import ( "context" + "time" consul "github.com/hashicorp/consul/api" "github.com/prometheus/client_golang/prometheus" @@ -20,9 +21,12 @@ type consulMetrics struct { func newConsulMetrics(registerer prometheus.Registerer) *consulMetrics { consulRequestDurationCollector := instrument.NewHistogramCollector(promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ - Name: "consul_request_duration_seconds", - Help: "Time spent on consul requests.", - Buckets: prometheus.DefBuckets, + Name: "consul_request_duration_seconds", + Help: "Time spent on consul requests.", + Buckets: prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"})) consulMetrics := consulMetrics{consulRequestDurationCollector} return &consulMetrics diff --git a/pkg/ring/kv/dynamodb/metrics.go b/pkg/ring/kv/dynamodb/metrics.go index 1d0f051da0e..e144315f541 100644 --- a/pkg/ring/kv/dynamodb/metrics.go +++ b/pkg/ring/kv/dynamodb/metrics.go @@ -3,6 +3,7 @@ package dynamodb import ( "context" "strconv" + "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -25,9 +26,12 @@ type dynamodbMetrics struct { func newDynamoDbMetrics(registerer prometheus.Registerer) *dynamodbMetrics { dynamodbRequestDurationCollector := instrument.NewHistogramCollector(promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{ - Name: "dynamodb_kv_request_duration_seconds", - Help: "Time spent on dynamodb requests.", - Buckets: prometheus.DefBuckets, + Name: "dynamodb_kv_request_duration_seconds", + Help: "Time spent on dynamodb requests.", + Buckets: 
prometheus.DefBuckets, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"})) dynamodbUsageMetrics := promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{ diff --git a/pkg/ring/kv/metrics.go b/pkg/ring/kv/metrics.go index 3d2fa8928c8..fe8af650d68 100644 --- a/pkg/ring/kv/metrics.go +++ b/pkg/ring/kv/metrics.go @@ -58,6 +58,9 @@ func newMetricsClient(backend string, c Client, reg prometheus.Registerer) Clien ConstLabels: prometheus.Labels{ "type": backend, }, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"}), ), } diff --git a/pkg/ring/lifecycler_metrics.go b/pkg/ring/lifecycler_metrics.go index 422a564c18b..e60a7662c84 100644 --- a/pkg/ring/lifecycler_metrics.go +++ b/pkg/ring/lifecycler_metrics.go @@ -1,6 +1,8 @@ package ring import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) @@ -30,10 +32,13 @@ func NewLifecyclerMetrics(ringName string, reg prometheus.Registerer) *Lifecycle ConstLabels: prometheus.Labels{"name": ringName}, }), shutdownDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "shutdown_duration_seconds", - Help: "Duration (in seconds) of shutdown procedure (ie transfer or flush).", - Buckets: prometheus.ExponentialBuckets(10, 2, 8), // Biggest bucket is 10*2^(9-1) = 2560, or 42 mins. - ConstLabels: prometheus.Labels{"name": ringName}, + Name: "shutdown_duration_seconds", + Help: "Duration (in seconds) of shutdown procedure (ie transfer or flush).", + Buckets: prometheus.ExponentialBuckets(10, 2, 8), // Biggest bucket is 10*2^(8-1) = 1280, or ~21 mins.
+ ConstLabels: prometheus.Labels{"name": ringName}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"op", "status"}), } diff --git a/pkg/ruler/client_pool.go b/pkg/ruler/client_pool.go index 0bb5ff05f14..70ee84f7ef5 100644 --- a/pkg/ruler/client_pool.go +++ b/pkg/ruler/client_pool.go @@ -54,9 +54,12 @@ func newRulerClientPool(clientCfg grpcclient.Config, logger log.Logger, reg prom func newRulerClientFactory(clientCfg grpcclient.Config, reg prometheus.Registerer) client.PoolFactory { requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_ruler_client_request_duration_seconds", - Help: "Time spent executing requests to the ruler.", - Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), + Name: "cortex_ruler_client_request_duration_seconds", + Help: "Time spent executing requests to the ruler.", + Buckets: prometheus.ExponentialBuckets(0.008, 4, 7), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"}) return func(addr string) (client.PoolClient, error) { diff --git a/pkg/ruler/frontend_client_pool.go b/pkg/ruler/frontend_client_pool.go index 38847861387..9ab140ccd90 100644 --- a/pkg/ruler/frontend_client_pool.go +++ b/pkg/ruler/frontend_client_pool.go @@ -30,9 +30,12 @@ func newFrontendPool(cfg Config, log log.Logger, reg prometheus.Registerer) *cli prometheusHTTPPrefix: cfg.PrometheusHTTPPrefix, grpcConfig: cfg.GRPCClientConfig, frontendClientRequestDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_ruler_query_frontend_request_duration_seconds", - Help: "Time spend doing requests to frontend.", - Buckets: prometheus.ExponentialBuckets(0.001, 4, 6), + Name: "cortex_ruler_query_frontend_request_duration_seconds", + Help: "Time spend doing requests to frontend.", + Buckets: 
prometheus.ExponentialBuckets(0.001, 4, 6), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"operation", "status_code"}), } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 3539c01e6bc..d84cdb5f8df 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -143,9 +143,12 @@ func NewScheduler(cfg Config, limits Limits, log log.Logger, registerer promethe s.requestQueue = queue.NewRequestQueue(cfg.QuerierForgetDelay, s.queueLength, s.discardedRequests, s.limits, registerer) s.queueDuration = promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_query_scheduler_queue_duration_seconds", - Help: "Time spend by requests in queue before getting picked up by a querier.", - Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 20, 30, 60}, + Name: "cortex_query_scheduler_queue_duration_seconds", + Help: "Time spend by requests in queue before getting picked up by a querier.", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 20, 30, 60}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }) s.connectedQuerierClients = promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{ Name: "cortex_query_scheduler_connected_querier_clients", diff --git a/pkg/storage/tsdb/bucketindex/loader.go b/pkg/storage/tsdb/bucketindex/loader.go index 59d7121a93b..ed9445805df 100644 --- a/pkg/storage/tsdb/bucketindex/loader.go +++ b/pkg/storage/tsdb/bucketindex/loader.go @@ -71,9 +71,12 @@ func NewLoader(cfg LoaderConfig, bucketClient objstore.Bucket, cfgProvider bucke Help: "Total number of bucket index loading failures.", }), loadDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_bucket_index_load_duration_seconds", - Help: "Duration of the a single bucket index loading operation in seconds.", - 
Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 1, 10}, + Name: "cortex_bucket_index_load_duration_seconds", + Help: "Duration of the a single bucket index loading operation in seconds.", + Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 1, 10}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), } diff --git a/pkg/storage/tsdb/multilevel_bucket_cache.go b/pkg/storage/tsdb/multilevel_bucket_cache.go index 66c6c432918..128217d3f32 100644 --- a/pkg/storage/tsdb/multilevel_bucket_cache.go +++ b/pkg/storage/tsdb/multilevel_bucket_cache.go @@ -83,14 +83,20 @@ func newMultiLevelBucketCache(name string, cfg MultiLevelBucketCacheConfig, reg caches: c, backfillProcessor: cacheutil.NewAsyncOperationProcessor(cfg.MaxAsyncBufferSize, cfg.MaxAsyncConcurrency), fetchLatency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: fmt.Sprintf("cortex_store_multilevel_%s_fetch_duration_seconds", itemName), - Help: fmt.Sprintf("Histogram to track latency to fetch items from multi level %s", metricHelpText), - Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + Name: fmt.Sprintf("cortex_store_multilevel_%s_fetch_duration_seconds", itemName), + Help: fmt.Sprintf("Histogram to track latency to fetch items from multi level %s", metricHelpText), + Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, nil), backFillLatency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: fmt.Sprintf("cortex_store_multilevel_%s_backfill_duration_seconds", itemName), - Help: fmt.Sprintf("Histogram to track latency to backfill items from multi level %s", metricHelpText), - Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + Name: 
fmt.Sprintf("cortex_store_multilevel_%s_backfill_duration_seconds", itemName), + Help: fmt.Sprintf("Histogram to track latency to backfill items from multi level %s", metricHelpText), + Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, nil), storeDroppedItems: promauto.With(reg).NewCounter(prometheus.CounterOpts{ Name: fmt.Sprintf("cortex_store_multilevel_%s_backfill_dropped_items_total", itemName), diff --git a/pkg/storage/tsdb/multilevel_index_cache.go b/pkg/storage/tsdb/multilevel_index_cache.go index bab35f74710..32681c2d8e1 100644 --- a/pkg/storage/tsdb/multilevel_index_cache.go +++ b/pkg/storage/tsdb/multilevel_index_cache.go @@ -5,6 +5,7 @@ import ( "errors" "maps" "slices" + "time" "github.com/oklog/ulid/v2" "github.com/prometheus/client_golang/prometheus" @@ -226,14 +227,20 @@ func newMultiLevelCache(reg prometheus.Registerer, cfg MultiLevelIndexCacheConfi expandedPostingCaches: filterCachesByItem(enabledItems, storecache.CacheTypeExpandedPostings, c...), backfillProcessor: cacheutil.NewAsyncOperationProcessor(cfg.MaxAsyncBufferSize, cfg.MaxAsyncConcurrency), fetchLatency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_store_multilevel_index_cache_fetch_duration_seconds", - Help: "Histogram to track latency to fetch items from multi level index cache", - Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + Name: "cortex_store_multilevel_index_cache_fetch_duration_seconds", + Help: "Histogram to track latency to fetch items from multi level index cache", + Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"item_type"}), backFillLatency: 
promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_store_multilevel_index_cache_backfill_duration_seconds", - Help: "Histogram to track latency to backfill items from multi level index cache", - Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + Name: "cortex_store_multilevel_index_cache_backfill_duration_seconds", + Help: "Histogram to track latency to backfill items from multi level index cache", + Buckets: []float64{0.01, 0.1, 0.3, 0.6, 1, 3, 6, 10, 15, 20, 25, 30, 40, 50, 60, 90}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }, []string{"item_type"}), backfillDroppedItems: map[string]prometheus.Counter{ storecache.CacheTypePostings: backfillDroppedItems.WithLabelValues(storecache.CacheTypePostings), diff --git a/pkg/storegateway/bucket_stores.go b/pkg/storegateway/bucket_stores.go index f017457a9f3..c905ff92396 100644 --- a/pkg/storegateway/bucket_stores.go +++ b/pkg/storegateway/bucket_stores.go @@ -148,9 +148,12 @@ func newThanosBucketStores(cfg tsdb.BlocksStorageConfig, shardingStrategy Shardi userTokenBuckets: make(map[string]*util.TokenBucket), inflightRequests: util.NewInflightRequestTracker(), syncTimes: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ - Name: "cortex_bucket_stores_blocks_sync_seconds", - Help: "The total time it takes to perform a sync stores", - Buckets: []float64{0.1, 1, 10, 30, 60, 120, 300, 600, 900}, + Name: "cortex_bucket_stores_blocks_sync_seconds", + Help: "The total time it takes to perform a sync stores", + Buckets: []float64{0.1, 1, 10, 30, 60, 120, 300, 600, 900}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: time.Hour, }), syncLastSuccess: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ Name: "cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds", From f1515d9b960ec43beff0a500e36bddc5dc1347e4 Mon Sep 
17 00:00:00 2001 From: Charlie Le Date: Thu, 18 Jun 2026 16:00:45 -0700 Subject: [PATCH 2/3] Add proposal for native histogram support across all production metrics Signed-off-by: Charlie Le --- docs/proposals/native-histograms.md | 93 +++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 docs/proposals/native-histograms.md diff --git a/docs/proposals/native-histograms.md b/docs/proposals/native-histograms.md new file mode 100644 index 00000000000..8d890e7b349 --- /dev/null +++ b/docs/proposals/native-histograms.md @@ -0,0 +1,93 @@ +# Native Histogram Support for All Production Metrics + +## Problem + +Cortex exposes ~56 histogram metrics across its components. Native histograms (introduced in Prometheus 2.40) offer significant advantages over classic histograms: + +1. **Higher resolution** — Exponential bucket boundaries adapt to observed values, eliminating the need to pre-define bucket ranges. +2. **Lower storage cost** — Native histograms typically use fewer time series than classic histograms with many buckets. +3. **Better aggregation** — Native histograms can be merged across instances without information loss from mismatched bucket boundaries. + +Previously, ~11 histograms were converted to dual-format (classic + native). The remaining ~45 production histograms were still classic-only, providing an inconsistent experience for operators who want to adopt native histograms. + +## Design + +### Dual-Format Histograms + +All production histograms are configured as dual-format by adding three fields to each `prometheus.HistogramOpts`: + +```go +NativeHistogramBucketFactor: 1.1, +NativeHistogramMaxBucketNumber: 100, +NativeHistogramMinResetDuration: time.Hour, +``` + +This means: +- **Classic scrapes** continue to work unchanged — the histogram exposes classic buckets as before. +- **Native-aware scrapes** (using `Accept: application/vnd.google.protobuf`) receive native histogram data instead. 
+- No behavioral change for existing deployments until they opt in to native histogram scraping. + +### Configuration Values + +| Field | Value | Rationale | +|-------|-------|-----------| +| `NativeHistogramBucketFactor` | 1.1 | ~10% relative resolution per bucket, providing good precision without excessive bucket count. Matches the pattern already established in the codebase. | +| `NativeHistogramMaxBucketNumber` | 100 | Upper bound on the number of buckets a single histogram may hold, capping its memory use and exposition size under pathological distributions. | +| `NativeHistogramMinResetDuration` | `time.Hour` | Minimum time between full histogram resets once the bucket limit is hit; within that window the client widens the zero bucket or lowers resolution instead of resetting, which avoids reset churn during transient spikes. | + +These values match the existing pattern used by the ~11 histograms already converted (e.g., in `pkg/distributor`, `pkg/ingester`). + +### Affected Components + +| Component | Histograms | +|-----------|-----------| +| Ingester | 8 (WAL replay, appender add/commit, queried samples/exemplars/series/chunks) | +| Querier | 7 (store gateway client, instances hit, refetches, blocks scan, tenant federation) | +| Cache | 4 (instrumented cache value size/duration, memcached, redis) | +| API | 3 (request duration, message sizes) | +| Distributor | 2 (query duration, labels per sample) | +| Compactor | 2 (meta sync duration, garbage collection duration) | +| Storage/TSDB | 5 (bucket index load, multilevel cache fetch/backfill) | +| Store Gateway | 1 (blocks sync) | +| Frontend | 2 (queue duration, retries) | +| Scheduler | 1 (queue duration) | +| Alertmanager | 2 (client request duration, initial sync duration) | +| Ring/KV | 4 (KV request duration, consul, dynamodb, lifecycler shutdown) | +| Ruler | 2 (client pool request duration, frontend client pool) | +| HA Tracker | 1 (change propagation time) | +| Configs | 2 (client request duration, database request duration) | +| Parquet Converter | 1 (block conversion delay) | + +### Backward Compatibility + +This change is fully backward compatible: + +- **No metric name
changes** — all existing metric names, labels, and bucket boundaries are preserved. +- **No scrape format change** — classic format is served unless the scraper explicitly requests native histograms via content negotiation. +- **No configuration required** — dual-format is enabled automatically; operators opt in to native scraping at the Prometheus/agent level. + +## Migration Path + +1. **Deploy updated Cortex** — histograms begin emitting dual-format data. No observable change for classic scrapers. +2. **Enable native histogram ingestion** in Prometheus (or compatible agents) by configuring `scrape_protocols` to include `PrometheusProto` (older Prometheus versions also need the `--enable-feature=native-histograms` flag). +3. **Update dashboards/alerts** — native histograms use the base metric name (without `_bucket`/`_sum`/`_count` suffixes). For example: + + ```promql + # Classic histogram query: + histogram_quantile(0.99, rate(cortex_ingester_tsdb_appender_commit_duration_seconds_bucket[5m])) + + # Native histogram query (uses base name directly): + histogram_quantile(0.99, rate(cortex_ingester_tsdb_appender_commit_duration_seconds[5m])) + ``` + + Native histograms also enable new PromQL functions not available with classic histograms: `histogram_avg()`, `histogram_fraction()`, `histogram_stddev()`, and `histogram_stdvar()`. + +## Alternatives Considered + +### Native-only (no classic buckets) + +Rejected because it would break existing dashboards and alerts that rely on classic bucket boundaries. Dual-format ensures zero disruption. + +### Per-histogram configuration + +Rejected because uniform settings simplify operations and the chosen values (1.1 factor, 100 max buckets, 1h min reset) are broadly suitable for all Cortex histograms. Operators who need coarser resolution can cap it at scrape time with Prometheus's `native_histogram_bucket_limit` and `native_histogram_min_bucket_factor` scrape settings.
From 1511b05d9a0ac982bf0ce0cdc7829cb112a076f3 Mon Sep 17 00:00:00 2001 From: Charlie Le Date: Thu, 18 Jun 2026 18:06:55 -0700 Subject: [PATCH 3/3] Add CHANGELOG entry for native histogram support Signed-off-by: Charlie Le --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8842af32f3..b3555a94337 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ * [ENHANCEMENT] Distributor: Add `WrappedHistogram` with configurable size limit (`-validation.max-native-histogram-size-bytes`) to cap native histogram protobuf size before unmarshalling. #7570 * [ENHANCEMENT] Ingester: Add lazy regex evaluation on head postings cache miss. Defers expensive regex matchers on high-cardinality labels to per-series filtering when a selective equality matcher already narrows the result set. Configured via `-blocks-storage.expanded_postings_cache.head.lazy-matcher-max-cardinality` (disabled by default). #7553 * [ENHANCEMENT] Ring: Add ring metric to count number of duplicate tokens. #7626 +* [ENHANCEMENT] Metrics: Add native histogram support to all remaining production histograms, enabling dual-format (classic + native) exposition across all Cortex components. * [BUGFIX] Querier: Fix queryWithRetry and labelsWithRetry returning (nil, nil) on cancelled context by propagating ctx.Err(). #7370 * [BUGFIX] Metrics Helper: Fix non-deterministic bucket order in merged histograms by sorting buckets after map iteration, matching Prometheus client library behavior. #7380 * [BUGFIX] Distributor: Return HTTP 401 Unauthorized when tenant ID resolution fails in the Prometheus Remote Write 2.0 path. #7389