1
0
Fork 1
mirror of https://github.com/git-pkgs/proxy.git synced 2026-06-02 16:48:16 -04:00
pkg-proxy/internal/server/health.go

182 lines
5.3 KiB
Go
Raw Permalink Normal View History

Add storage backend probe to /health (closes #73) (#119) * config: add Health.StorageProbeInterval * metrics: add proxy_health_probe_failures_total counter * server: add storageProbe with happy-path test * server: add storageProbe failure-mode tests * server: add healthCache with TTL, single-flight, transition logging * server: wire storage probe into /health * server: update TestHealthEndpoint for JSON; wire healthCache into newTestServer Also fix Windows file-locking issue in storageProbe: close the reader explicitly before Delete so the file handle is released prior to os.Remove. * server: clean up stale comment in storageProbe * docs: document storage health probe and new metric * docs: regenerate Swagger for /health JSON response * server: simplify rc.Close error handling in storageProbe * server: defer probe cleanup so size/open/read/verify failures don't leak objects Previously, storageProbe only called Delete on the success path. Any failure between Store and the final Delete (size mismatch, Open error, mid-stream read failure, content mismatch) left the probe object orphaned in the storage backend. With caching disabled and Kubernetes-rate probing, the leak could accumulate noticeably on backends like S3. Use a named return + defer to attempt Delete after every successful Store. The earlier-step failure remains the primary error; Delete failure only surfaces as step="delete" when nothing else went wrong. Add a table-driven test that asserts cleanup runs for each non-delete failure path. Reported by Copilot on #119. * config: validate health.storage_probe_interval in Config.Validate The new duration field was only validated at use time in newHealthCache. The existing codebase already validates other duration fields (MetadataTTL, DirectServeTTL, Gradle.MaxAge, Gradle.SweepInterval) in Config.Validate() so misconfiguration fails fast at startup with a config-key-specific error. Match that pattern. 
The parse-at-use code in newHealthCache stays as a safety net, mirroring the MetadataTTL precedent. Reported by Copilot on #119. * docs: lowercase "counter" in metrics table for consistency Other rows in the table use lowercase type names (counter/gauge/histogram). Match that style. Reported by Copilot on #119. * docs: include size-check step in /health probe description The probe is write → size-check → read → verify → delete; the architecture note was missing the size-check step. Reported by Copilot on #119. * server: address andrew's review on #119 - Drop unused callerCtx parameter from healthCache.Check (Check is now parameter-less; the comment-only "accepted for symmetry" justification wasn't carrying its weight). - Emit "storage": {"status": "skipped"} on DB short-circuit instead of omitting the key, so monitors expecting a fixed key set keep working. - Reject negative storage_probe_interval at config validation time (previously parsed and silently behaved like "0"). - Extract HealthConfig.Validate to keep Config.Validate under the gocognit threshold and match the existing GradleBuildCacheConfig pattern. - README Health Check section: note that /health is intended as a readiness probe rather than a liveness probe (Check holds a mutex for up to the 10s probe timeout). - cmd/proxy/main.go godoc: column-align the new env var with the surrounding Gradle entries. Reported by andrew on #119.
2026-05-22 14:14:01 +03:00
// Package server implements the proxy HTTP server.
package server
import (
"bytes"
"context"
"crypto/rand"
"encoding/hex"
"errors"
"fmt"
"io"
"log/slog"
"strconv"
"sync"
"time"
"github.com/git-pkgs/proxy/internal/metrics"
"github.com/git-pkgs/proxy/internal/storage"
)
// Tunables and naming scheme for the storage health probe.
const (
	// probePathPrefix is prepended to every probe object's path.
	probePathPrefix = ".healthcheck/"

	// probeMarker is the fixed leading part of the probe payload; the random
	// suffix is appended to it.
	probeMarker = "proxy-healthcheck:"

	// probeSuffixBytes is the number of random bytes drawn for the suffix
	// (hex-encoding doubles the resulting string length).
	probeSuffixBytes = 8

	// defaultProbeTTL is how long a probe result is cached when no interval
	// is configured.
	defaultProbeTTL = 30 * time.Second

	// defaultProbeTimeout bounds a single probe round-trip.
	defaultProbeTimeout = 10 * time.Second
)
// HealthResponse is the top-level JSON document served by /health.
type HealthResponse struct {
	// Status is the overall health verdict.
	Status string `json:"status"`

	// Checks holds one entry per subsystem check, keyed by check name.
	Checks map[string]HealthCheck `json:"checks"`
}
// HealthCheck is the outcome of one subsystem check inside a HealthResponse.
type HealthCheck struct {
	// Status is the verdict for this check.
	Status string `json:"status"`

	// Error carries the failure message; omitted from JSON when empty.
	Error string `json:"error,omitempty"`

	// Step names the stage that failed; omitted from JSON when empty.
	Step string `json:"step,omitempty"`
}
// probeError pairs a storage probe failure with the name of the step that
// produced it, so callers can report where the round-trip broke.
type probeError struct {
	step string // name of the failing step
	err  error  // underlying cause
}

// Error renders the failure as "<step>: <cause>".
func (e *probeError) Error() string {
	cause := e.err.Error()
	return e.step + ": " + cause
}

// Unwrap exposes the underlying cause to errors.Is and errors.As.
func (e *probeError) Unwrap() error {
	return e.err
}
// storageProbe runs a write → size-check → read → verify → delete round-trip
// against the storage backend.
//
// It stores a small payload under a unique path beneath probePathPrefix,
// checks the reported size, reads the object back, compares the bytes and
// finally deletes the object.
//
// It returns nil on success, or a *probeError whose step is one of "write",
// "size", "read", "verify" or "delete". A delete failure is only reported
// when no earlier step failed.
func storageProbe(ctx context.Context, s storage.Storage) (err error) {
	// cleanupTimeout bounds the fallback cleanup attempt made when ctx is
	// already done by the time the probe object has to be removed.
	const cleanupTimeout = 5 * time.Second

	suffix, suffixErr := randomSuffix()
	if suffixErr != nil {
		return &probeError{step: "write", err: fmt.Errorf("generating random suffix: %w", suffixErr)}
	}
	// Timestamp plus random suffix keeps concurrent probes from colliding.
	path := probePathPrefix + strconv.FormatInt(time.Now().UnixNano(), 10) + "-" + suffix
	payload := []byte(probeMarker + suffix)

	// 1. Store
	size, _, storeErr := s.Store(ctx, path, bytes.NewReader(payload))
	if storeErr != nil {
		return &probeError{step: "write", err: storeErr}
	}

	// After Store succeeds, always attempt to delete on the way out so probe
	// objects don't accumulate when a later step (size/open/read/verify) fails.
	// Delete is reported as the primary error only if no earlier failure
	// already set one.
	defer func() {
		cleanupCtx := ctx
		if ctx.Err() != nil {
			// The probe context has already expired or been cancelled, which
			// is typically why an earlier step failed. A backend that honours
			// the context would reject the Delete as well and leave the
			// object behind, so clean up under a detached, short-lived
			// context instead. This can extend the probe by up to
			// cleanupTimeout beyond ctx's own deadline.
			detached, cancel := context.WithTimeout(context.WithoutCancel(ctx), cleanupTimeout)
			defer cancel()
			cleanupCtx = detached
		}
		if delErr := s.Delete(cleanupCtx, path); delErr != nil && err == nil {
			err = &probeError{step: "delete", err: delErr}
		}
	}()

	// 2. Size check
	if size != int64(len(payload)) {
		return &probeError{step: "size", err: fmt.Errorf("wrote %d bytes, expected %d", size, len(payload))}
	}

	// 3. Open
	rc, openErr := s.Open(ctx, path)
	if openErr != nil {
		return &probeError{step: "read", err: openErr}
	}

	// 4. Read all (classify mid-stream errors as read, not verify).
	// Close explicitly (not deferred) so the file handle is released before
	// Delete — on Windows, an open handle prevents deletion.
	data, readErr := io.ReadAll(rc)
	// The Close error is deliberately ignored: any read failure is already
	// captured in readErr and the content is verified below.
	_ = rc.Close()
	if readErr != nil {
		return &probeError{step: "read", err: readErr}
	}

	// 5. Verify
	if !bytes.Equal(data, payload) {
		return &probeError{step: "verify", err: errors.New("content mismatch")}
	}

	// 6. Delete is handled via the deferred cleanup above.
	return nil
}
// randomSuffix returns 8 cryptographically random bytes hex-encoded.
func randomSuffix() (string, error) {
b := make([]byte, probeSuffixBytes)
if _, err := rand.Read(b); err != nil {
return "", err
}
return hex.EncodeToString(b), nil
}
// healthCache remembers the most recent storageProbe outcome and reuses it
// until the configured interval has elapsed. All access to the cached state
// goes through mu, so a healthCache may be shared between goroutines.
type healthCache struct {
	// Configuration, populated by newHealthCache.
	storage      storage.Storage // backend exercised by the probe
	interval     time.Duration   // cache lifetime; values <= 0 disable caching
	probeTimeout time.Duration   // upper bound on one probe round-trip
	logger       *slog.Logger    // receives healthy/unhealthy transition logs

	// mu serializes Check and guards lastAt and lastErr.
	mu      sync.Mutex
	lastAt  time.Time // when the last probe finished; zero until the first probe
	lastErr error     // result of the last probe; nil means healthy
}
// newHealthCache builds a cache, parsing the interval from a duration string.
//
// An empty intervalStr selects the default of 30s (defaultProbeTTL). "0" or
// "0s" disables caching, so every Check runs a fresh probe. A string that
// does not parse as a duration, or that parses to a negative duration, is
// rejected with an error; previously a negative value was accepted and
// silently behaved like "0".
//
// A nil logger is replaced with slog.Default() so that transition logging
// cannot dereference a nil logger.
func newHealthCache(s storage.Storage, intervalStr string, logger *slog.Logger) (*healthCache, error) {
	interval := defaultProbeTTL
	if intervalStr != "" {
		d, err := time.ParseDuration(intervalStr)
		if err != nil {
			return nil, fmt.Errorf("parsing storage_probe_interval %q: %w", intervalStr, err)
		}
		if d < 0 {
			return nil, fmt.Errorf("storage_probe_interval %q must not be negative", intervalStr)
		}
		interval = d
	}
	if logger == nil {
		logger = slog.Default()
	}
	return &healthCache{
		storage:      s,
		interval:     interval,
		probeTimeout: defaultProbeTimeout,
		logger:       logger,
	}, nil
}
// Check reports the health of the storage backend, serving the previous
// result while it is younger than the configured interval and running a new
// probe otherwise.
//
// The probe deliberately uses a context rooted at context.Background() with a
// fixed timeout rather than a caller-supplied one, so a caller going away
// (e.g. client disconnect) cannot poison the cache with context.Canceled.
// The mutex is held for the whole call, including the probe itself.
func (c *healthCache) Check() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Serve from cache when caching is enabled, a probe has already run, and
	// that result has not yet expired.
	cachingEnabled := c.interval > 0
	probedBefore := !c.lastAt.IsZero()
	if cachingEnabled && probedBefore && time.Since(c.lastAt) < c.interval {
		return c.lastErr
	}

	// Otherwise run a fresh probe under a detached, time-limited context.
	probeCtx, cancel := context.WithTimeout(context.Background(), c.probeTimeout)
	defer cancel()
	result := storageProbe(probeCtx, c.storage)

	// Transition logging and the failure metric apply only to fresh probes,
	// never to cache hits.
	c.logTransition(c.lastErr, result)
	if result != nil {
		step := "unknown"
		var pe *probeError
		if errors.As(result, &pe) {
			step = pe.step
		}
		metrics.RecordHealthProbeFailure(step)
	}

	c.lastErr, c.lastAt = result, time.Now()
	return result
}
// logTransition emits a log line only when the probe outcome flips between
// healthy (nil) and unhealthy (non-nil): Info on recovery, Error on a new
// failure. A state that persists across probes is not logged again.
func (c *healthCache) logTransition(prev, curr error) {
	wasHealthy := prev == nil
	isHealthy := curr == nil
	if wasHealthy == isHealthy {
		// No change in state; nothing to report.
		return
	}
	if isHealthy {
		c.logger.Info("storage probe recovered")
		return
	}
	c.logger.Error("storage probe failed", "error", curr.Error())
}