1
0
Fork 1
mirror of https://github.com/git-pkgs/proxy.git synced 2026-06-02 08:38:17 -04:00
pkg-proxy/internal/metrics/metrics.go

240 lines
6.5 KiB
Go
Raw Permalink Normal View History

2026-02-03 22:40:23 +00:00
// Package metrics provides Prometheus metrics collection for the proxy.
package metrics
import (
"net/http"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
// Request metrics
RequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_requests_total",
Help: "Total number of requests by ecosystem and status",
},
[]string{"ecosystem", "status"},
)
RequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "proxy_request_duration_seconds",
Help: "Request duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"ecosystem", "status"},
)
// Cache metrics
CacheHits = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_cache_hits_total",
Help: "Total number of cache hits by ecosystem",
},
[]string{"ecosystem"},
)
CacheMisses = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_cache_misses_total",
Help: "Total number of cache misses by ecosystem",
},
[]string{"ecosystem"},
)
CacheSize = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "proxy_cache_size_bytes",
Help: "Total size of cached artifacts in bytes",
},
)
CachedArtifacts = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "proxy_cached_artifacts_total",
Help: "Total number of cached artifacts",
},
)
// Upstream metrics
UpstreamFetchDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "proxy_upstream_fetch_duration_seconds",
Help: "Upstream fetch duration in seconds",
Buckets: []float64{.1, .25, .5, 1, 2.5, 5, 10, 30},
},
[]string{"ecosystem"},
)
UpstreamErrors = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_upstream_errors_total",
Help: "Total number of upstream fetch errors by type",
},
[]string{"ecosystem", "error_type"},
)
// Circuit breaker metrics
CircuitBreakerState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "proxy_circuit_breaker_state",
Help: "Circuit breaker state (0=closed, 1=half-open, 2=open)",
},
[]string{"registry"},
)
CircuitBreakerTrips = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_circuit_breaker_trips_total",
Help: "Total number of circuit breaker trips",
},
[]string{"registry"},
)
// Storage metrics
StorageOperationDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "proxy_storage_operation_duration_seconds",
Help: "Storage operation duration in seconds",
Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1},
},
[]string{"operation"},
)
StorageErrors = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_storage_errors_total",
Help: "Total number of storage errors by operation",
},
[]string{"operation"},
)
// Active requests
ActiveRequests = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "proxy_active_requests",
Help: "Number of currently active requests",
},
)
IntegrityFailures = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_integrity_failures_total",
Help: "Cached artifacts that failed hash verification on read",
},
[]string{"ecosystem"},
)
Add storage backend probe to /health (closes #73) (#119) * config: add Health.StorageProbeInterval * metrics: add proxy_health_probe_failures_total counter * server: add storageProbe with happy-path test * server: add storageProbe failure-mode tests * server: add healthCache with TTL, single-flight, transition logging * server: wire storage probe into /health * server: update TestHealthEndpoint for JSON; wire healthCache into newTestServer Also fix Windows file-locking issue in storageProbe: close the reader explicitly before Delete so the file handle is released prior to os.Remove. * server: clean up stale comment in storageProbe * docs: document storage health probe and new metric * docs: regenerate Swagger for /health JSON response * server: simplify rc.Close error handling in storageProbe * server: defer probe cleanup so size/open/read/verify failures don't leak objects Previously, storageProbe only called Delete on the success path. Any failure between Store and the final Delete (size mismatch, Open error, mid-stream read failure, content mismatch) left the probe object orphaned in the storage backend. With caching disabled and Kubernetes-rate probing, the leak could accumulate noticeably on backends like S3. Use a named return + defer to attempt Delete after every successful Store. The earlier-step failure remains the primary error; Delete failure only surfaces as step="delete" when nothing else went wrong. Add a table-driven test that asserts cleanup runs for each non-delete failure path. Reported by Copilot on #119. * config: validate health.storage_probe_interval in Config.Validate The new duration field was only validated at use time in newHealthCache. The existing codebase already validates other duration fields (MetadataTTL, DirectServeTTL, Gradle.MaxAge, Gradle.SweepInterval) in Config.Validate() so misconfiguration fails fast at startup with a config-key-specific error. Match that pattern. 
The parse-at-use code in newHealthCache stays as a safety net, mirroring the MetadataTTL precedent. Reported by Copilot on #119. * docs: lowercase "counter" in metrics table for consistency Other rows in the table use lowercase type names (counter/gauge/histogram). Match that style. Reported by Copilot on #119. * docs: include size-check step in /health probe description The probe is write → size-check → read → verify → delete; the architecture note was missing the size-check step. Reported by Copilot on #119. * server: address andrew's review on #119 - Drop unused callerCtx parameter from healthCache.Check (Check is now parameter-less; the comment-only "accepted for symmetry" justification wasn't carrying its weight). - Emit "storage": {"status": "skipped"} on DB short-circuit instead of omitting the key, so monitors expecting a fixed key set keep working. - Reject negative storage_probe_interval at config validation time (previously parsed and silently behaved like "0"). - Extract HealthConfig.Validate to keep Config.Validate under the gocognit threshold and match the existing GradleBuildCacheConfig pattern. - README Health Check section: note that /health is intended as a readiness probe rather than a liveness probe (Check holds a mutex for up to the 10s probe timeout). - cmd/proxy/main.go godoc: column-align the new env var with the surrounding Gradle entries. Reported by andrew on #119.
2026-05-22 14:14:01 +03:00
HealthProbeFailures = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_health_probe_failures_total",
Help: "Total number of storage health probe failures, by step (write|size|read|verify|delete).",
},
[]string{"step"},
)
2026-02-03 22:40:23 +00:00
)
func init() {
// Register all metrics with Prometheus
prometheus.MustRegister(
RequestsTotal,
RequestDuration,
CacheHits,
CacheMisses,
CacheSize,
CachedArtifacts,
UpstreamFetchDuration,
UpstreamErrors,
CircuitBreakerState,
CircuitBreakerTrips,
StorageOperationDuration,
StorageErrors,
ActiveRequests,
IntegrityFailures,
Add storage backend probe to /health (closes #73) (#119) * config: add Health.StorageProbeInterval * metrics: add proxy_health_probe_failures_total counter * server: add storageProbe with happy-path test * server: add storageProbe failure-mode tests * server: add healthCache with TTL, single-flight, transition logging * server: wire storage probe into /health * server: update TestHealthEndpoint for JSON; wire healthCache into newTestServer Also fix Windows file-locking issue in storageProbe: close the reader explicitly before Delete so the file handle is released prior to os.Remove. * server: clean up stale comment in storageProbe * docs: document storage health probe and new metric * docs: regenerate Swagger for /health JSON response * server: simplify rc.Close error handling in storageProbe * server: defer probe cleanup so size/open/read/verify failures don't leak objects Previously, storageProbe only called Delete on the success path. Any failure between Store and the final Delete (size mismatch, Open error, mid-stream read failure, content mismatch) left the probe object orphaned in the storage backend. With caching disabled and Kubernetes-rate probing, the leak could accumulate noticeably on backends like S3. Use a named return + defer to attempt Delete after every successful Store. The earlier-step failure remains the primary error; Delete failure only surfaces as step="delete" when nothing else went wrong. Add a table-driven test that asserts cleanup runs for each non-delete failure path. Reported by Copilot on #119. * config: validate health.storage_probe_interval in Config.Validate The new duration field was only validated at use time in newHealthCache. The existing codebase already validates other duration fields (MetadataTTL, DirectServeTTL, Gradle.MaxAge, Gradle.SweepInterval) in Config.Validate() so misconfiguration fails fast at startup with a config-key-specific error. Match that pattern. 
The parse-at-use code in newHealthCache stays as a safety net, mirroring the MetadataTTL precedent. Reported by Copilot on #119. * docs: lowercase "counter" in metrics table for consistency Other rows in the table use lowercase type names (counter/gauge/histogram). Match that style. Reported by Copilot on #119. * docs: include size-check step in /health probe description The probe is write → size-check → read → verify → delete; the architecture note was missing the size-check step. Reported by Copilot on #119. * server: address andrew's review on #119 - Drop unused callerCtx parameter from healthCache.Check (Check is now parameter-less; the comment-only "accepted for symmetry" justification wasn't carrying its weight). - Emit "storage": {"status": "skipped"} on DB short-circuit instead of omitting the key, so monitors expecting a fixed key set keep working. - Reject negative storage_probe_interval at config validation time (previously parsed and silently behaved like "0"). - Extract HealthConfig.Validate to keep Config.Validate under the gocognit threshold and match the existing GradleBuildCacheConfig pattern. - README Health Check section: note that /health is intended as a readiness probe rather than a liveness probe (Check holds a mutex for up to the 10s probe timeout). - cmd/proxy/main.go godoc: column-align the new env var with the surrounding Gradle entries. Reported by andrew on #119.
2026-05-22 14:14:01 +03:00
HealthProbeFailures,
2026-02-03 22:40:23 +00:00
)
}
// Handler returns the http.Handler built by promhttp.Handler, intended to be
// mounted as the Prometheus /metrics endpoint.
func Handler() http.Handler {
	metricsHandler := promhttp.Handler()
	return metricsHandler
}
// RecordRequest records one request: it increments RequestsTotal and observes
// the duration, in seconds, in RequestDuration. Both series are labeled with
// ecosystem and the decimal string form of status.
func RecordRequest(ecosystem string, status int, duration time.Duration) {
	code := strconv.Itoa(status)
	seconds := duration.Seconds()

	RequestsTotal.WithLabelValues(ecosystem, code).Inc()
	RequestDuration.WithLabelValues(ecosystem, code).Observe(seconds)
}
// RecordCacheHit adds one to the CacheHits counter for the given ecosystem.
func RecordCacheHit(ecosystem string) {
	hits := CacheHits.WithLabelValues(ecosystem)
	hits.Inc()
}
// RecordCacheMiss adds one to the CacheMisses counter for the given ecosystem.
func RecordCacheMiss(ecosystem string) {
	misses := CacheMisses.WithLabelValues(ecosystem)
	misses.Inc()
}
// RecordUpstreamFetch observes duration, converted to seconds, in the
// UpstreamFetchDuration histogram for the given ecosystem.
func RecordUpstreamFetch(ecosystem string, duration time.Duration) {
	seconds := duration.Seconds()
	UpstreamFetchDuration.WithLabelValues(ecosystem).Observe(seconds)
}
// RecordUpstreamError adds one to the UpstreamErrors counter for the given
// ecosystem and errorType label values.
func RecordUpstreamError(ecosystem, errorType string) {
	errCounter := UpstreamErrors.WithLabelValues(ecosystem, errorType)
	errCounter.Inc()
}
// RecordStorageOperation observes duration, converted to seconds, in the
// StorageOperationDuration histogram for the given operation label.
func RecordStorageOperation(operation string, duration time.Duration) {
	seconds := duration.Seconds()
	StorageOperationDuration.WithLabelValues(operation).Observe(seconds)
}
// RecordIntegrityFailure increments the integrity failure counter.
func RecordIntegrityFailure(ecosystem string) {
IntegrityFailures.WithLabelValues(ecosystem).Inc()
Add storage backend probe to /health (closes #73) (#119) * config: add Health.StorageProbeInterval * metrics: add proxy_health_probe_failures_total counter * server: add storageProbe with happy-path test * server: add storageProbe failure-mode tests * server: add healthCache with TTL, single-flight, transition logging * server: wire storage probe into /health * server: update TestHealthEndpoint for JSON; wire healthCache into newTestServer Also fix Windows file-locking issue in storageProbe: close the reader explicitly before Delete so the file handle is released prior to os.Remove. * server: clean up stale comment in storageProbe * docs: document storage health probe and new metric * docs: regenerate Swagger for /health JSON response * server: simplify rc.Close error handling in storageProbe * server: defer probe cleanup so size/open/read/verify failures don't leak objects Previously, storageProbe only called Delete on the success path. Any failure between Store and the final Delete (size mismatch, Open error, mid-stream read failure, content mismatch) left the probe object orphaned in the storage backend. With caching disabled and Kubernetes-rate probing, the leak could accumulate noticeably on backends like S3. Use a named return + defer to attempt Delete after every successful Store. The earlier-step failure remains the primary error; Delete failure only surfaces as step="delete" when nothing else went wrong. Add a table-driven test that asserts cleanup runs for each non-delete failure path. Reported by Copilot on #119. * config: validate health.storage_probe_interval in Config.Validate The new duration field was only validated at use time in newHealthCache. The existing codebase already validates other duration fields (MetadataTTL, DirectServeTTL, Gradle.MaxAge, Gradle.SweepInterval) in Config.Validate() so misconfiguration fails fast at startup with a config-key-specific error. Match that pattern. 
The parse-at-use code in newHealthCache stays as a safety net, mirroring the MetadataTTL precedent. Reported by Copilot on #119. * docs: lowercase "counter" in metrics table for consistency Other rows in the table use lowercase type names (counter/gauge/histogram). Match that style. Reported by Copilot on #119. * docs: include size-check step in /health probe description The probe is write → size-check → read → verify → delete; the architecture note was missing the size-check step. Reported by Copilot on #119. * server: address andrew's review on #119 - Drop unused callerCtx parameter from healthCache.Check (Check is now parameter-less; the comment-only "accepted for symmetry" justification wasn't carrying its weight). - Emit "storage": {"status": "skipped"} on DB short-circuit instead of omitting the key, so monitors expecting a fixed key set keep working. - Reject negative storage_probe_interval at config validation time (previously parsed and silently behaved like "0"). - Extract HealthConfig.Validate to keep Config.Validate under the gocognit threshold and match the existing GradleBuildCacheConfig pattern. - README Health Check section: note that /health is intended as a readiness probe rather than a liveness probe (Check holds a mutex for up to the 10s probe timeout). - cmd/proxy/main.go godoc: column-align the new env var with the surrounding Gradle entries. Reported by andrew on #119.
2026-05-22 14:14:01 +03:00
}
// RecordHealthProbeFailure increments the health probe failure counter.
// step is one of: "write", "size", "read", "verify", "delete".
func RecordHealthProbeFailure(step string) {
HealthProbeFailures.WithLabelValues(step).Inc()
}
2026-02-03 22:40:23 +00:00
// RecordStorageError adds one to the StorageErrors counter for the given
// operation label.
func RecordStorageError(operation string) {
	errCounter := StorageErrors.WithLabelValues(operation)
	errCounter.Inc()
}
// UpdateCacheStats sets the CacheSize gauge to sizeBytes and the
// CachedArtifacts gauge to artifactCount, each converted to float64.
func UpdateCacheStats(sizeBytes, artifactCount int64) {
	size := float64(sizeBytes)
	count := float64(artifactCount)

	CacheSize.Set(size)
	CachedArtifacts.Set(count)
}
// UpdateCircuitBreakerState sets the CircuitBreakerState gauge for registry
// to the numeric state.
// state: 0=closed, 1=half-open, 2=open
func UpdateCircuitBreakerState(registry string, state int) {
	gauge := CircuitBreakerState.WithLabelValues(registry)
	gauge.Set(float64(state))
}
// RecordCircuitBreakerTrip adds one to the CircuitBreakerTrips counter for
// the given registry.
func RecordCircuitBreakerTrip(registry string) {
	trips := CircuitBreakerTrips.WithLabelValues(registry)
	trips.Inc()
}
// IncrementActiveRequests raises the ActiveRequests gauge by one.
func IncrementActiveRequests() {
	ActiveRequests.Inc()
}
// DecrementActiveRequests lowers the ActiveRequests gauge by one.
func DecrementActiveRequests() {
	ActiveRequests.Dec()
}