diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b029c6e9..cc7d3711a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## Unreleased * [FEATURE] HTTP handlers created by `promhttp` package now support metrics filtering by providing one or more `name[]` query parameters. The default behavior when none are provided remains the same, returning all metrics. #1925 +* [FEATURE] prometheus: `Registry` exposes a new `DisableHelpTextValidation` method to opt out of the strict per-fqName help-text consistency check during `Gather`. Useful when multiple libraries (or different versions of the same library) register compatible metrics into the same registry. Default behavior is unchanged, and `NewPedanticRegistry` always overrides the toggle. #2006 ## Unreleased `exp` module diff --git a/prometheus/registry.go b/prometheus/registry.go index 8dd906c6b..9891e3db8 100644 --- a/prometheus/registry.go +++ b/prometheus/registry.go @@ -271,12 +271,38 @@ func (errs MultiError) MaybeUnwrap() error { // Registry implements Collector to allow it to be used for creating groups of // metrics. See the Grouping example for how this can be done. type Registry struct { - mtx sync.RWMutex - collectorsByID map[uint64]Collector // ID is a hash of the descIDs. - descIDs map[uint64]struct{} - dimHashesByName map[string]uint64 - uncheckedCollectors []Collector - pedanticChecksEnabled bool + mtx sync.RWMutex + collectorsByID map[uint64]Collector // ID is a hash of the descIDs. + descIDs map[uint64]struct{} + dimHashesByName map[string]uint64 + uncheckedCollectors []Collector + pedanticChecksEnabled bool + helpTextValidationDisabled bool +} + +// DisableHelpTextValidation opts this Registry out of the strict check that +// every metric sharing a fully-qualified name also share the same help text. +// When disabled, Gather keeps the help text from the first metric it sees for +// a given fqName and aggregates later metrics into the same family even if +// their help differs. +// +// This is useful when multiple libraries — or different versions of the same +// library — register compatible metrics into a shared registry. A trivial +// difference such as a trailing period gained across a client_golang upgrade +// can otherwise turn a metadata mismatch into a hard /metrics outage. The +// Prometheus server itself applies the same lenient semantics on scrape (see +// https://github.com/prometheus/client_golang/issues/1820). +// +// The strict default is recommended for development and testing, where it +// surfaces accidental help-text changes early. Calling this method on a +// Registry returned by NewPedanticRegistry has no effect: pedantic mode +// always enforces help-text consistency. The change applies to subsequent +// Gather calls and is safe to invoke after registration; concurrent calls +// are serialized by the Registry's lock. +func (r *Registry) DisableHelpTextValidation() { + r.mtx.Lock() + defer r.mtx.Unlock() + r.helpTextValidationDisabled = true } // Register implements Registerer. @@ -468,6 +494,13 @@ func (r *Registry) Gather() ([]*dto.MetricFamily, error) { registeredDescIDs[id] = struct{}{} } } + // Snapshot the effective help-text validation toggle while still under + // the lock so that all processMetric calls in this Gather see a + // consistent value. Pedantic mode unconditionally overrides the toggle + // because pedantic callers have explicitly opted into the strictest set + // of checks; mixing pedantic with lenient help validation would be + // internally contradictory. + helpTextValidationDisabled := r.helpTextValidationDisabled && !r.pedanticChecksEnabled r.mtx.RUnlock() wg.Add(goroutineBudget) @@ -526,6 +559,7 @@ func (r *Registry) Gather() ([]*dto.MetricFamily, error) { metric, metricFamiliesByName, metricHashes, registeredDescIDs, + helpTextValidationDisabled, )) case metric, ok := <-umc: if !ok { @@ -536,6 +570,7 @@ func (r *Registry) Gather() ([]*dto.MetricFamily, error) { metric, metricFamiliesByName, metricHashes, nil, + helpTextValidationDisabled, )) default: if goroutineBudget <= 0 || len(checkedCollectors)+len(uncheckedCollectors) == 0 { @@ -553,6 +588,7 @@ func (r *Registry) Gather() ([]*dto.MetricFamily, error) { metric, metricFamiliesByName, metricHashes, registeredDescIDs, + helpTextValidationDisabled, )) case metric, ok := <-umc: if !ok { @@ -563,6 +599,7 @@ func (r *Registry) Gather() ([]*dto.MetricFamily, error) { metric, metricFamiliesByName, metricHashes, nil, + helpTextValidationDisabled, )) } break @@ -659,11 +696,18 @@ func WriteToTextfile(filename string, g Gatherer) error { } // processMetric is an internal helper method only used by the Gather method. +// +// helpTextValidationDisabled gates the strict per-fqName help-text consistency +// check. When true, a metric whose help string differs from the family's +// existing help is still aggregated into the family, and the family keeps the +// first help text seen. See Registry.DisableHelpTextValidation for the +// motivating use case (#1820). func processMetric( metric Metric, metricFamiliesByName map[string]*dto.MetricFamily, metricHashes map[uint64]struct{}, registeredDescIDs map[uint64]struct{}, + helpTextValidationDisabled bool, ) error { desc := metric.Desc() // Wrapped metrics collected by an unchecked Collector can have an @@ -677,7 +721,7 @@ func processMetric( } metricFamily, ok := metricFamiliesByName[desc.fqName] if ok { // Existing name. - if metricFamily.GetHelp() != desc.help { + if metricFamily.GetHelp() != desc.help && !helpTextValidationDisabled { return fmt.Errorf( "collected metric %s %s has help %q but should have %q", desc.fqName, dtoMetric, desc.help, metricFamily.GetHelp(), diff --git a/prometheus/registry_test.go b/prometheus/registry_test.go index 379984fef..911d83319 100644 --- a/prometheus/registry_test.go +++ b/prometheus/registry_test.go @@ -1396,6 +1396,162 @@ func TestCheckMetricConsistency(t *testing.T) { reg.Unregister(invalidCollector) } +// helpTextCollector emits a single counter with a configurable fully-qualified +// name, help text, and label set. Its Describe method yields no Desc so the +// collector is treated as "unchecked" by the registry, which means it bypasses +// the help/dimHash consistency checks performed at Register time. This is the +// shape that triggers the cross-version scenario described in +// https://github.com/prometheus/client_golang/issues/1820: two libraries (often +// different versions of the same library) register compatible metrics into the +// same registry but use slightly different help strings — for example, one +// version adds a trailing period. +type helpTextCollector struct { + fqName string + help string + label string +} + +func (c helpTextCollector) Describe(_ chan<- *prometheus.Desc) {} + +func (c helpTextCollector) Collect(ch chan<- prometheus.Metric) { + desc := prometheus.NewDesc(c.fqName, c.help, []string{"src"}, nil) + ch <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, 1, c.label) +} + +// orderedHelpTextCollector emits multiple ConstMetrics under the same fqName +// with different help texts in deterministic source order. A single collector +// is used so that the order of emissions on the metric channel — and therefore +// which help text populates the MetricFamily first — is well-defined. Two +// separately-registered collectors are insufficient here because the registry +// fans out collection across goroutines. +type orderedHelpTextCollector struct { + fqName string + emits []struct{ help, label string } +} + +func (c orderedHelpTextCollector) Describe(_ chan<- *prometheus.Desc) {} + +func (c orderedHelpTextCollector) Collect(ch chan<- prometheus.Metric) { + for _, e := range c.emits { + desc := prometheus.NewDesc(c.fqName, e.help, []string{"src"}, nil) + ch <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, 1, e.label) + } +} + +// TestGatherRejectsHelpTextMismatchByDefault locks in the long-standing strict +// behavior: a non-pedantic Registry with no opt-in still rejects metrics that +// share an fqName but disagree on help text. This is the backwards-compatibility +// guarantee that callers depending on the existing error contract continue to +// observe. +func TestGatherRejectsHelpTextMismatchByDefault(t *testing.T) { + const fqName = "interop_total" + first := helpTextCollector{fqName: fqName, help: "Total count of interop events", label: "first"} + second := helpTextCollector{fqName: fqName, help: "Total count of interop events.", label: "second"} + + reg := prometheus.NewRegistry() + reg.MustRegister(first, second) + + if _, err := reg.Gather(); err == nil { + t.Fatal("expected Gather to return an error for help-text mismatch on a strict (default) Registry, got nil") + } +} + +// TestGatherAcceptsHelpTextMismatchWhenDisabled is a regression test for +// https://github.com/prometheus/client_golang/issues/1820. When the caller has +// opted out of help-text validation via Registry.DisableHelpTextValidation, +// Gather must aggregate metrics that share an fqName but use slightly +// different help strings — most commonly two libraries (or two versions of the +// same library) registering compatible metrics into the same registry. +// +// The test asserts that Gather succeeds, returns exactly one MetricFamily +// containing the metrics from both collectors, and keeps one of the observed +// help strings on the family (the registry fans out collection across +// goroutines, so we do not constrain which one — see +// TestGatherWithDisabledHelpTextValidationKeepsFirstHelp below for the +// deterministic guarantee). +func TestGatherAcceptsHelpTextMismatchWhenDisabled(t *testing.T) { + const fqName = "interop_total" + first := helpTextCollector{fqName: fqName, help: "Total count of interop events", label: "first"} + second := helpTextCollector{fqName: fqName, help: "Total count of interop events.", label: "second"} + + reg := prometheus.NewRegistry() + reg.DisableHelpTextValidation() + reg.MustRegister(first, second) + + got, err := reg.Gather() + if err != nil { + t.Fatalf("Gather returned an error after DisableHelpTextValidation: %v", err) + } + if len(got) != 1 { + t.Fatalf("expected exactly one MetricFamily, got %d: %v", len(got), got) + } + mf := got[0] + if mf.GetName() != fqName { + t.Errorf("MetricFamily name = %q, want %q", mf.GetName(), fqName) + } + if h := mf.GetHelp(); h != first.help && h != second.help { + t.Errorf("MetricFamily help = %q, want one of %q or %q", h, first.help, second.help) + } + if len(mf.GetMetric()) != 2 { + t.Errorf("expected MetricFamily to hold metrics from both collectors, got %d", len(mf.GetMetric())) + } +} + +// TestGatherWithDisabledHelpTextValidationKeepsFirstHelp pins the documented +// "first help text wins" semantics. A single collector emits both metrics in +// source order, so the metric-channel order — and therefore which help text +// is recorded on the MetricFamily — is deterministic. +func TestGatherWithDisabledHelpTextValidationKeepsFirstHelp(t *testing.T) { + const ( + fqName = "interop_total" + firstHelp = "Total count of interop events" + laterHelp = "Total count of interop events." + ) + c := orderedHelpTextCollector{ + fqName: fqName, + emits: []struct{ help, label string }{ + {help: firstHelp, label: "first"}, + {help: laterHelp, label: "later"}, + }, + } + + reg := prometheus.NewRegistry() + reg.DisableHelpTextValidation() + reg.MustRegister(c) + + got, err := reg.Gather() + if err != nil { + t.Fatalf("Gather returned an error: %v", err) + } + if len(got) != 1 { + t.Fatalf("expected exactly one MetricFamily, got %d: %v", len(got), got) + } + if h := got[0].GetHelp(); h != firstHelp { + t.Errorf("MetricFamily help = %q, want %q (first-emitted help text should win)", h, firstHelp) + } + if n := len(got[0].GetMetric()); n != 2 { + t.Errorf("expected MetricFamily to hold 2 metrics, got %d", n) + } +} + +// TestPedanticRegistryOverridesHelpTextValidationToggle documents that calling +// DisableHelpTextValidation on a pedantic Registry has no effect: pedantic +// callers have opted into the strictest set of checks, so a help-text +// mismatch must still fail Gather even if the toggle was flipped. +func TestPedanticRegistryOverridesHelpTextValidationToggle(t *testing.T) { + const fqName = "interop_total" + first := helpTextCollector{fqName: fqName, help: "Total count of interop events", label: "first"} + second := helpTextCollector{fqName: fqName, help: "Total count of interop events.", label: "second"} + + reg := prometheus.NewPedanticRegistry() + reg.DisableHelpTextValidation() // Must be ignored under pedantic mode. + reg.MustRegister(first, second) + + if _, err := reg.Gather(); err == nil { + t.Fatal("expected pedantic Registry to still reject help-text mismatch even after DisableHelpTextValidation, got nil error") + } +} + func TestGatherDoesNotLeakGoroutines(t *testing.T) { // Use goleak to verify that no unexpected goroutines are leaked during the test. defer goleak.VerifyNone(t)