various: add internal health and runtime metrics
Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: Iae1dcf8495a00159d588c6e2344312f36a6a6964
This commit is contained in:
parent
d7cdf2cc49
commit
27b3641717
5 changed files with 187 additions and 4 deletions
|
|
@ -11,8 +11,11 @@ import (
|
|||
)
|
||||
|
||||
var (
|
||||
cfgFile string
|
||||
cfg *config.Config
|
||||
cfgFile string
|
||||
cfg *config.Config
|
||||
version string
|
||||
commit string
|
||||
buildDate string
|
||||
)
|
||||
|
||||
var rootCmd = &cobra.Command{
|
||||
|
|
@ -84,7 +87,8 @@ func initConfig() {
|
|||
}
|
||||
}
|
||||
|
||||
func Main() {
|
||||
func Main(v, c, bd string) {
|
||||
version, commit, buildDate = v, c, bd
|
||||
if err := rootCmd.Execute(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||||
os.Exit(1)
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ import (
|
|||
"notashelf.dev/watchdog/internal/aggregate"
|
||||
"notashelf.dev/watchdog/internal/api"
|
||||
"notashelf.dev/watchdog/internal/config"
|
||||
"notashelf.dev/watchdog/internal/health"
|
||||
"notashelf.dev/watchdog/internal/limits"
|
||||
"notashelf.dev/watchdog/internal/normalize"
|
||||
)
|
||||
|
|
@ -55,6 +56,12 @@ func Run(cfg *config.Config) error {
|
|||
metricsAgg.MustRegister(promRegistry)
|
||||
promRegistry.MustRegister(blockedRequests)
|
||||
|
||||
// Register health and runtime metrics
|
||||
healthCollector := health.NewCollector(version, commit, buildDate)
|
||||
if err := healthCollector.Register(promRegistry); err != nil {
|
||||
return fmt.Errorf("failed to register health metrics: %w", err)
|
||||
}
|
||||
|
||||
// Create HTTP handlers
|
||||
ingestionHandler := api.NewIngestionHandler(
|
||||
cfg,
|
||||
|
|
|
|||
56
internal/health/metrics.go
Normal file
56
internal/health/metrics.go
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
package health
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/collectors"
|
||||
)
|
||||
|
||||
// Holds health and runtime metrics for the watchdog process
|
||||
type Collector struct {
|
||||
buildInfo prometheus.Gauge
|
||||
startTime prometheus.Gauge
|
||||
}
|
||||
|
||||
// Creates a health metrics collector with build metadata
|
||||
func NewCollector(version, commit, buildDate string) *Collector {
|
||||
buildInfo := prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "watchdog_build_info",
|
||||
Help: "Build metadata for the running watchdog instance",
|
||||
ConstLabels: prometheus.Labels{
|
||||
"version": version,
|
||||
"commit": commit,
|
||||
"build_date": buildDate,
|
||||
},
|
||||
})
|
||||
buildInfo.Set(1)
|
||||
|
||||
startTime := prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "watchdog_start_time_seconds",
|
||||
Help: "Unix timestamp of when the watchdog process started",
|
||||
})
|
||||
startTime.Set(float64(time.Now().Unix()))
|
||||
|
||||
return &Collector{
|
||||
buildInfo: buildInfo,
|
||||
startTime: startTime,
|
||||
}
|
||||
}
|
||||
|
||||
// Registers all health metrics plus Go runtime collectors
|
||||
func (c *Collector) Register(reg prometheus.Registerer) error {
|
||||
if err := reg.Register(c.buildInfo); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := reg.Register(c.startTime); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := reg.Register(collectors.NewGoCollector()); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := reg.Register(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
103
internal/health/metrics_test.go
Normal file
103
internal/health/metrics_test.go
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
package health
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
func TestNewCollector_RegistersMetrics(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
c := NewCollector("v0.1.0", "abc1234", "2026-03-02")
|
||||
|
||||
if err := c.Register(reg); err != nil {
|
||||
t.Fatalf("Register failed: %v", err)
|
||||
}
|
||||
|
||||
metrics, err := reg.Gather()
|
||||
if err != nil {
|
||||
t.Fatalf("Gather failed: %v", err)
|
||||
}
|
||||
|
||||
// Should have at least build_info and uptime
|
||||
names := make(map[string]bool)
|
||||
for _, m := range metrics {
|
||||
names[m.GetName()] = true
|
||||
}
|
||||
|
||||
if !names["watchdog_build_info"] {
|
||||
t.Error("expected watchdog_build_info metric")
|
||||
}
|
||||
if !names["watchdog_start_time_seconds"] {
|
||||
t.Error("expected watchdog_start_time_seconds metric")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewCollector_BuildInfoLabels(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
c := NewCollector("v1.2.3", "deadbeef", "2026-03-02")
|
||||
|
||||
if err := c.Register(reg); err != nil {
|
||||
t.Fatalf("Register failed: %v", err)
|
||||
}
|
||||
|
||||
metrics, err := reg.Gather()
|
||||
if err != nil {
|
||||
t.Fatalf("Gather failed: %v", err)
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
if m.GetName() != "watchdog_build_info" {
|
||||
continue
|
||||
}
|
||||
|
||||
labels := make(map[string]string)
|
||||
for _, l := range m.GetMetric()[0].GetLabel() {
|
||||
labels[l.GetName()] = l.GetValue()
|
||||
}
|
||||
|
||||
if labels["version"] != "v1.2.3" {
|
||||
t.Errorf("expected version label %q, got %q", "v1.2.3", labels["version"])
|
||||
}
|
||||
if labels["commit"] != "deadbeef" {
|
||||
t.Errorf("expected commit label %q, got %q", "deadbeef", labels["commit"])
|
||||
}
|
||||
if labels["build_date"] != "2026-03-02" {
|
||||
t.Errorf(
|
||||
"expected build_date label %q, got %q",
|
||||
"2026-03-02",
|
||||
labels["build_date"],
|
||||
)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
t.Error("watchdog_build_info metric not found in gathered metrics")
|
||||
}
|
||||
|
||||
func TestNewCollector_StartTimeIsPositive(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
c := NewCollector("v0.1.0", "abc1234", "2026-03-02")
|
||||
|
||||
if err := c.Register(reg); err != nil {
|
||||
t.Fatalf("Register failed: %v", err)
|
||||
}
|
||||
|
||||
metrics, err := reg.Gather()
|
||||
if err != nil {
|
||||
t.Fatalf("Gather failed: %v", err)
|
||||
}
|
||||
|
||||
for _, m := range metrics {
|
||||
if m.GetName() != "watchdog_start_time_seconds" {
|
||||
continue
|
||||
}
|
||||
val := m.GetMetric()[0].GetGauge().GetValue()
|
||||
if val <= 0 {
|
||||
t.Errorf("expected positive start time, got %v", val)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
t.Error("watchdog_start_time_seconds metric not found")
|
||||
}
|
||||
15
main.go
15
main.go
|
|
@ -2,6 +2,19 @@ package main
|
|||
|
||||
import "notashelf.dev/watchdog/cmd/watchdog"
|
||||
|
||||
// Injected at build time via ldflags:
|
||||
//
|
||||
// -X main.Version=v1.0.0
|
||||
// -X main.Commit=abc1234
|
||||
// -X main.BuildDate=2026-03-02
|
||||
//
|
||||
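// I hate this pattern btw: it forces mutable package-level state, but the
// linker's -X flag can only overwrite package-level string variables.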
|
||||
var (
|
||||
Version = "dev"
|
||||
Commit = "unknown"
|
||||
BuildDate = "unknown"
|
||||
)
|
||||
|
||||
func main() {
|
||||
watchdog.Main()
|
||||
watchdog.Main(Version, Commit, BuildDate)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue