From 7b06c4f2ca7c410a9a0ff334c7c77b7884bc62e6 Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Sun, 1 Mar 2026 21:14:07 +0300 Subject: [PATCH 01/10] various: extract magic numbers into named constants Signed-off-by: NotAShelf Change-Id: I854b2f9b5f39e4629c32e5681e6322826a6a6964 --- cmd/watchdog/root.go | 12 ++++++------ internal/aggregate/metrics.go | 5 +++-- internal/limits/constants.go | 11 +++++++++++ internal/normalize/path.go | 3 ++- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cmd/watchdog/root.go b/cmd/watchdog/root.go index e64e158..9ab1df0 100644 --- a/cmd/watchdog/root.go +++ b/cmd/watchdog/root.go @@ -11,13 +11,13 @@ import ( "path/filepath" "strings" "syscall" - "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "notashelf.dev/watchdog/internal/aggregate" "notashelf.dev/watchdog/internal/api" "notashelf.dev/watchdog/internal/config" + "notashelf.dev/watchdog/internal/limits" "notashelf.dev/watchdog/internal/normalize" ) @@ -91,9 +91,9 @@ func Run(cfg *config.Config) error { srv := &http.Server{ Addr: cfg.Server.ListenAddr, Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - IdleTimeout: 60 * time.Second, + ReadTimeout: limits.HTTPReadTimeout, + WriteTimeout: limits.HTTPWriteTimeout, + IdleTimeout: limits.HTTPIdleTimeout, } // Start server in goroutine @@ -115,8 +115,8 @@ func Run(cfg *config.Config) error { case sig := <-shutdown: log.Printf("Received signal: %v, starting graceful shutdown", sig) - // Give outstanding requests 30 seconds to complete - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + // Give outstanding requests time to complete + ctx, cancel := context.WithTimeout(context.Background(), limits.ShutdownTimeout) defer cancel() // Shutdown metrics aggregator. 
diff --git a/internal/aggregate/metrics.go b/internal/aggregate/metrics.go index a9ba3ad..055896e 100644 --- a/internal/aggregate/metrics.go +++ b/internal/aggregate/metrics.go @@ -9,6 +9,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "notashelf.dev/watchdog/internal/config" + "notashelf.dev/watchdog/internal/limits" ) var prometheusLabelPattern = regexp.MustCompile(`^[a-zA-Z0-9_/:.-]*$`) @@ -121,11 +122,11 @@ func NewMetricsAggregator( return m } -// Background goroutine to update the unique visitors gauge every 10 seconds +// Background goroutine to update the unique visitors gauge periodically // instead of on every request. This should help with performance. func (m *MetricsAggregator) updateUniquesGauge() { defer m.wg.Done() - ticker := time.NewTicker(10 * time.Second) + ticker := time.NewTicker(limits.UniquesUpdatePeriod) defer ticker.Stop() for { diff --git a/internal/limits/constants.go b/internal/limits/constants.go index 14985a9..2db5dc1 100644 --- a/internal/limits/constants.go +++ b/internal/limits/constants.go @@ -1,5 +1,7 @@ package limits +import "time" + // Size limits for request processing const ( MaxEventSize = 4 * 1024 // 4KB max event payload @@ -7,3 +9,12 @@ const ( MaxRefLen = 2048 // max referrer length MaxWidth = 10000 // max reasonable screen width ) + +// Timeout constants +const ( + HTTPReadTimeout = 10 * time.Second // HTTP server read timeout + HTTPWriteTimeout = 10 * time.Second // HTTP server write timeout + HTTPIdleTimeout = 60 * time.Second // HTTP server idle timeout + ShutdownTimeout = 30 * time.Second // graceful shutdown timeout + UniquesUpdatePeriod = 10 * time.Second // HLL gauge update interval +) diff --git a/internal/normalize/path.go b/internal/normalize/path.go index 2148f34..9836b67 100644 --- a/internal/normalize/path.go +++ b/internal/normalize/path.go @@ -4,6 +4,7 @@ import ( "strings" "notashelf.dev/watchdog/internal/config" + "notashelf.dev/watchdog/internal/limits" ) type PathNormalizer struct { @@ 
-14,7 +15,7 @@ type PathNormalizer struct { func NewPathNormalizer(cfg config.PathConfig) *PathNormalizer { return &PathNormalizer{ cfg: cfg, - maxLength: 2048, + maxLength: limits.MaxPathLen, } } From 896ec1a40a404796004b5ad3847eceda39069e53 Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Sun, 1 Mar 2026 21:21:20 +0300 Subject: [PATCH 02/10] watchdog: add metrics for blocked requests & logging Signed-off-by: NotAShelf Change-Id: Ib1d876859422a6115772962ed9e207a46a6a6964 --- cmd/watchdog/root.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/cmd/watchdog/root.go b/cmd/watchdog/root.go index 9ab1df0..b933cea 100644 --- a/cmd/watchdog/root.go +++ b/cmd/watchdog/root.go @@ -31,6 +31,15 @@ func Run(cfg *config.Config) error { eventRegistry := aggregate.NewCustomEventRegistry(cfg.Limits.MaxCustomEvents) metricsAgg := aggregate.NewMetricsAggregator(pathRegistry, eventRegistry, cfg) + // Metric for tracking blocked file requests (scrapers/bots) + blockedRequests := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "web_blocked_requests_total", + Help: "File server requests blocked by security filters", + }, + []string{"reason"}, + ) + // Load HLL state from previous run if it exists if cfg.Site.SaltRotation != "" { log.Println("HLL state persistence enabled") @@ -44,6 +53,7 @@ func Run(cfg *config.Config) error { // Register Prometheus metrics promRegistry := prometheus.NewRegistry() metricsAgg.MustRegister(promRegistry) + promRegistry.MustRegister(blockedRequests) // Create HTTP handlers ingestionHandler := api.NewIngestionHandler( @@ -84,7 +94,7 @@ func Run(cfg *config.Config) error { // Serve whitelisted static files from /web/ if the directory exists if info, err := os.Stat("web"); err == nil && info.IsDir() { log.Println("Serving static files from /web/") - mux.Handle("/web/", safeFileServer("web")) + mux.Handle("/web/", safeFileServer("web", blockedRequests)) } // Create HTTP server with timeouts @@ -153,7 +163,7 
@@ func basicAuth(next http.Handler, username, password string) http.Handler { // Creates a file server that only serves whitelisted files. Blocks dotfiles, .git, .env, etc. // TODO: I need to hook this up to eris somehow so I can just forward the paths that are being // scanned despite not being on a whitelist. Would be a good way of detecting scrapers, maybe. -func safeFileServer(root string) http.Handler { +func safeFileServer(root string, blockedRequests *prometheus.CounterVec) http.Handler { fs := http.FileServer(http.Dir(root)) return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { // Clean the path @@ -161,6 +171,8 @@ func safeFileServer(root string) http.Handler { // Block directory listings if strings.HasSuffix(path, "/") { + blockedRequests.WithLabelValues("directory_listing").Inc() + log.Printf("Blocked directory listing attempt: %s from %s", path, r.RemoteAddr) http.NotFound(w, r) return } @@ -168,6 +180,8 @@ func safeFileServer(root string) http.Handler { // Block dotfiles and sensitive files for segment := range strings.SplitSeq(path, "/") { if strings.HasPrefix(segment, ".") { + blockedRequests.WithLabelValues("dotfile").Inc() + log.Printf("Blocked dotfile access: %s from %s", path, r.RemoteAddr) http.NotFound(w, r) return } @@ -177,6 +191,8 @@ func safeFileServer(root string) http.Handler { strings.Contains(lower, "config") || strings.HasSuffix(lower, ".bak") || strings.HasSuffix(lower, "~") { + blockedRequests.WithLabelValues("sensitive_file").Inc() + log.Printf("Blocked sensitive file access: %s from %s", path, r.RemoteAddr) http.NotFound(w, r) return } @@ -185,6 +201,8 @@ func safeFileServer(root string) http.Handler { // Only serve .js, .html, .css files ext := strings.ToLower(filepath.Ext(path)) if ext != ".js" && ext != ".html" && ext != ".css" { + blockedRequests.WithLabelValues("invalid_extension").Inc() + log.Printf("Blocked invalid extension: %s from %s", path, r.RemoteAddr) http.NotFound(w, r) return } From 
6977a501b16e4cdfa89571cebc1d5134dfec0843 Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Mon, 2 Mar 2026 21:27:47 +0300 Subject: [PATCH 03/10] internal: better device classification via UA parsing Signed-off-by: NotAShelf Change-Id: I6c78f1eebe71ef4cf037ebbda2caaeb36a6a6964 --- internal/api/handler.go | 52 +++++++++++---- internal/api/handler_test.go | 123 ++++++++++++++++++++++++++--------- 2 files changed, 130 insertions(+), 45 deletions(-) diff --git a/internal/api/handler.go b/internal/api/handler.go index bed0abf..4cb3905 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -137,7 +137,7 @@ func (h *IngestionHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { // Device classification if h.cfg.Site.Collect.Device { - device = h.classifyDevice(event.Width) + device = h.classifyDevice(event.Width, userAgent) } // Referrer classification @@ -271,19 +271,43 @@ func (h *IngestionHandler) ipInCIDR(ip, cidr string) bool { return network.Contains(testIP) } -// Classifies screen width into device categories using configured breakpoints -// FIXME: we need a more robust mechanism for classifying devices. Breakpoints -// are the only ones I can think of *right now* but I'm positive there are better -// mechanisns. We'll get to this later. 
-func (h *IngestionHandler) classifyDevice(width int) string { - if width == 0 { - return "unknown" - } - if width < h.cfg.Limits.DeviceBreakpoints.Mobile { - return "mobile" - } - if width < h.cfg.Limits.DeviceBreakpoints.Tablet { +// Classifies device using both screen width and User-Agent parsing +// Uses UA hints for better detection, falls back to width breakpoints +func (h *IngestionHandler) classifyDevice(width int, userAgent string) string { + // First try User-Agent based detection for better accuracy + ua := strings.ToLower(userAgent) + + // Tablet detection via UA (must come before mobile: Android tablets lack "mobile" keyword) + if strings.Contains(ua, "tablet") || + strings.Contains(ua, "ipad") || + (strings.Contains(ua, "android") && !strings.Contains(ua, "mobile")) { return "tablet" } - return "desktop" + + // Mobile detection via UA + if strings.Contains(ua, "mobile") || + strings.Contains(ua, "iphone") || + strings.Contains(ua, "ipod") || + strings.Contains(ua, "windows phone") || + strings.Contains(ua, "blackberry") { + return "mobile" + } + + // If UA doesn't provide clear signal, use width breakpoints + if width > 0 { + if width < h.cfg.Limits.DeviceBreakpoints.Mobile { + return "mobile" + } + if width < h.cfg.Limits.DeviceBreakpoints.Tablet { + return "tablet" + } + return "desktop" + } + + // Default to desktop if UA suggests desktop browser + if userAgent != "" { + return "desktop" + } + + return "unknown" } diff --git a/internal/api/handler_test.go b/internal/api/handler_test.go index cb788d9..cee1dbb 100644 --- a/internal/api/handler_test.go +++ b/internal/api/handler_test.go @@ -2,7 +2,6 @@ package api import ( "bytes" - "fmt" "net/http" "net/http/httptest" "testing" @@ -207,51 +206,113 @@ func TestIngestionHandler_InvalidJSON(t *testing.T) { } } -func TestIngestionHandler_DeviceClassification(t *testing.T) { - cfg := config.Config{ - Site: config.SiteConfig{ - Domains: []string{"example.com"}, - Collect: config.CollectConfig{ - Pageviews: 
true, - Device: true, - }, - Path: config.PathConfig{}, - }, - Limits: config.LimitsConfig{ - MaxPaths: 100, - MaxSources: 50, - }, - } - +func newTestHandler(cfg *config.Config) *IngestionHandler { pathNorm := normalize.NewPathNormalizer(cfg.Site.Path) pathRegistry := aggregate.NewPathRegistry(cfg.Limits.MaxPaths) refRegistry := normalize.NewReferrerRegistry(cfg.Limits.MaxSources) metricsAgg := aggregate.NewMetricsAggregator( pathRegistry, aggregate.NewCustomEventRegistry(100), - &cfg, + cfg, ) + return NewIngestionHandler(cfg, pathNorm, pathRegistry, refRegistry, metricsAgg) +} - handler := NewIngestionHandler(&cfg, pathNorm, pathRegistry, refRegistry, metricsAgg) +func TestClassifyDevice_UA(t *testing.T) { + cfg := &config.Config{ + Limits: config.LimitsConfig{ + DeviceBreakpoints: config.DeviceBreaks{ + Mobile: 768, + Tablet: 1024, + }, + }, + } + h := newTestHandler(cfg) tests := []struct { - name string - width int + name string + width int + userAgent string + want string }{ - {"mobile", 375}, - {"tablet", 768}, - {"desktop", 1920}, + // UA takes priority + { + name: "iphone via UA", + width: 390, + userAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15", + want: "mobile", + }, + { + name: "android phone via UA", + width: 0, + userAgent: "Mozilla/5.0 (Linux; Android 13; Pixel 7) Mobile Safari/537.36", + want: "mobile", + }, + { + name: "windows phone via UA", + width: 0, + userAgent: "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0)", + want: "mobile", + }, + { + name: "ipad via UA", + width: 1024, + userAgent: "Mozilla/5.0 (iPad; CPU OS 17_0 like Mac OS X) AppleWebKit/605.1.15", + want: "tablet", + }, + { + name: "android tablet via UA (no mobile keyword)", + width: 0, + userAgent: "Mozilla/5.0 (Linux; Android 13; SM-T870) AppleWebKit/537.36", + want: "tablet", + }, + // Falls back to width when UA is desktop + { + name: "desktop UA wide screen", + width: 1920, + userAgent: "Mozilla/5.0 (X11; Linux x86_64) 
AppleWebKit/537.36 Chrome/120.0", + want: "desktop", + }, + { + name: "desktop UA narrow width", + width: 500, + userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0", + want: "mobile", + }, + // Width-only fallback + { + name: "no UA mobile width", + width: 375, + userAgent: "", + want: "mobile", + }, + { + name: "no UA tablet width", + width: 800, + userAgent: "", + want: "tablet", + }, + { + name: "no UA desktop width", + width: 1440, + userAgent: "", + want: "desktop", + }, + // Unknown + { + name: "no UA no width", + width: 0, + userAgent: "", + want: "unknown", + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - body := fmt.Sprintf(`{"d":"example.com","p":"/test","w":%d}`, tt.width) - req := httptest.NewRequest("POST", "/api/event", bytes.NewBufferString(body)) - w := httptest.NewRecorder() - handler.ServeHTTP(w, req) - - if w.Code != http.StatusNoContent { - t.Errorf("expected status %d, got %d", http.StatusNoContent, w.Code) + got := h.classifyDevice(tt.width, tt.userAgent) + if got != tt.want { + t.Errorf("classifyDevice(%d, %q) = %q, want %q", + tt.width, tt.userAgent, got, tt.want) } }) } From d7cdf2cc49fdd0dad0d0f10ac8519ccdf5157b5e Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Mon, 2 Mar 2026 21:27:59 +0300 Subject: [PATCH 04/10] chore: fix typo in `dailySalt` comment Signed-off-by: NotAShelf Change-Id: I8f0d0bf4bc597f0aecfd98c292f38cdb6a6a6964 --- internal/aggregate/uniques.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/aggregate/uniques.go b/internal/aggregate/uniques.go index 1c7b6de..8d8149f 100644 --- a/internal/aggregate/uniques.go +++ b/internal/aggregate/uniques.go @@ -53,7 +53,7 @@ func (u *UniquesEstimator) Estimate() uint64 { return u.hll.Estimate() } -// Cenerates a deterministic salt based on the current date +// Generates a deterministic salt based on the current date // Same day = same salt, different day = different salt func dailySalt(t time.Time) string { // 
Use UTC to ensure consistent rotation regardless of timezone From 27b3641717399aca8775b807ee7527c4fc065c78 Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Mon, 2 Mar 2026 21:38:43 +0300 Subject: [PATCH 05/10] various: add internal health and runtime metrics Signed-off-by: NotAShelf Change-Id: Iae1dcf8495a00159d588c6e2344312f36a6a6964 --- cmd/watchdog/main.go | 10 +++- cmd/watchdog/root.go | 7 +++ internal/health/metrics.go | 56 +++++++++++++++++ internal/health/metrics_test.go | 103 ++++++++++++++++++++++++++++++++ main.go | 15 ++++- 5 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 internal/health/metrics.go create mode 100644 internal/health/metrics_test.go diff --git a/cmd/watchdog/main.go b/cmd/watchdog/main.go index 3d11120..ed8c2da 100644 --- a/cmd/watchdog/main.go +++ b/cmd/watchdog/main.go @@ -11,8 +11,11 @@ import ( ) var ( - cfgFile string - cfg *config.Config + cfgFile string + cfg *config.Config + version string + commit string + buildDate string ) var rootCmd = &cobra.Command{ @@ -84,7 +87,8 @@ func initConfig() { } } -func Main() { +func Main(v, c, bd string) { + version, commit, buildDate = v, c, bd if err := rootCmd.Execute(); err != nil { fmt.Fprintf(os.Stderr, "Error: %v\n", err) os.Exit(1) diff --git a/cmd/watchdog/root.go b/cmd/watchdog/root.go index b933cea..8d2050f 100644 --- a/cmd/watchdog/root.go +++ b/cmd/watchdog/root.go @@ -17,6 +17,7 @@ import ( "notashelf.dev/watchdog/internal/aggregate" "notashelf.dev/watchdog/internal/api" "notashelf.dev/watchdog/internal/config" + "notashelf.dev/watchdog/internal/health" "notashelf.dev/watchdog/internal/limits" "notashelf.dev/watchdog/internal/normalize" ) @@ -55,6 +56,12 @@ func Run(cfg *config.Config) error { metricsAgg.MustRegister(promRegistry) promRegistry.MustRegister(blockedRequests) + // Register health and runtime metrics + healthCollector := health.NewCollector(version, commit, buildDate) + if err := healthCollector.Register(promRegistry); err != nil { + return 
fmt.Errorf("failed to register health metrics: %w", err) + } + // Create HTTP handlers ingestionHandler := api.NewIngestionHandler( cfg, diff --git a/internal/health/metrics.go b/internal/health/metrics.go new file mode 100644 index 0000000..a49c659 --- /dev/null +++ b/internal/health/metrics.go @@ -0,0 +1,56 @@ +package health + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" +) + +// Holds health and runtime metrics for the watchdog process +type Collector struct { + buildInfo prometheus.Gauge + startTime prometheus.Gauge +} + +// Creates a health metrics collector with build metadata +func NewCollector(version, commit, buildDate string) *Collector { + buildInfo := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "watchdog_build_info", + Help: "Build metadata for the running watchdog instance", + ConstLabels: prometheus.Labels{ + "version": version, + "commit": commit, + "build_date": buildDate, + }, + }) + buildInfo.Set(1) + + startTime := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "watchdog_start_time_seconds", + Help: "Unix timestamp of when the watchdog process started", + }) + startTime.Set(float64(time.Now().Unix())) + + return &Collector{ + buildInfo: buildInfo, + startTime: startTime, + } +} + +// Registers all health metrics plus Go runtime collectors +func (c *Collector) Register(reg prometheus.Registerer) error { + if err := reg.Register(c.buildInfo); err != nil { + return err + } + if err := reg.Register(c.startTime); err != nil { + return err + } + if err := reg.Register(collectors.NewGoCollector()); err != nil { + return err + } + if err := reg.Register(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})); err != nil { + return err + } + return nil +} diff --git a/internal/health/metrics_test.go b/internal/health/metrics_test.go new file mode 100644 index 0000000..7df5dbb --- /dev/null +++ b/internal/health/metrics_test.go @@ -0,0 +1,103 @@ 
+package health + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" +) + +func TestNewCollector_RegistersMetrics(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector("v0.1.0", "abc1234", "2026-03-02") + + if err := c.Register(reg); err != nil { + t.Fatalf("Register failed: %v", err) + } + + metrics, err := reg.Gather() + if err != nil { + t.Fatalf("Gather failed: %v", err) + } + + // Should have at least build_info and uptime + names := make(map[string]bool) + for _, m := range metrics { + names[m.GetName()] = true + } + + if !names["watchdog_build_info"] { + t.Error("expected watchdog_build_info metric") + } + if !names["watchdog_start_time_seconds"] { + t.Error("expected watchdog_start_time_seconds metric") + } +} + +func TestNewCollector_BuildInfoLabels(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector("v1.2.3", "deadbeef", "2026-03-02") + + if err := c.Register(reg); err != nil { + t.Fatalf("Register failed: %v", err) + } + + metrics, err := reg.Gather() + if err != nil { + t.Fatalf("Gather failed: %v", err) + } + + for _, m := range metrics { + if m.GetName() != "watchdog_build_info" { + continue + } + + labels := make(map[string]string) + for _, l := range m.GetMetric()[0].GetLabel() { + labels[l.GetName()] = l.GetValue() + } + + if labels["version"] != "v1.2.3" { + t.Errorf("expected version label %q, got %q", "v1.2.3", labels["version"]) + } + if labels["commit"] != "deadbeef" { + t.Errorf("expected commit label %q, got %q", "deadbeef", labels["commit"]) + } + if labels["build_date"] != "2026-03-02" { + t.Errorf( + "expected build_date label %q, got %q", + "2026-03-02", + labels["build_date"], + ) + } + return + } + + t.Error("watchdog_build_info metric not found in gathered metrics") +} + +func TestNewCollector_StartTimeIsPositive(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector("v0.1.0", "abc1234", "2026-03-02") + + if err := c.Register(reg); err != nil { + 
t.Fatalf("Register failed: %v", err) + } + + metrics, err := reg.Gather() + if err != nil { + t.Fatalf("Gather failed: %v", err) + } + + for _, m := range metrics { + if m.GetName() != "watchdog_start_time_seconds" { + continue + } + val := m.GetMetric()[0].GetGauge().GetValue() + if val <= 0 { + t.Errorf("expected positive start time, got %v", val) + } + return + } + + t.Error("watchdog_start_time_seconds metric not found") +} diff --git a/main.go b/main.go index ce24c8f..1f5d21b 100644 --- a/main.go +++ b/main.go @@ -2,6 +2,19 @@ package main import "notashelf.dev/watchdog/cmd/watchdog" +// Injected at build time via ldflags: +// +// -X main.Version=v1.0.0 +// -X main.Commit=abc1234 +// -X main.BuildDate=2026-03-02 +// +// I hate this pattern btw. +var ( + Version = "dev" + Commit = "unknown" + BuildDate = "unknown" +) + func main() { - watchdog.Main() + watchdog.Main(Version, Commit, BuildDate) } From dc6b6e0c0c34ca3e10865f2625d6153ba53c4cc9 Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Mon, 2 Mar 2026 22:08:34 +0300 Subject: [PATCH 06/10] nix: correct ldflags Signed-off-by: NotAShelf Change-Id: I5806b91c9dc1dfa9690a6e01cd29059b6a6a6964 --- nix/package.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nix/package.nix b/nix/package.nix index f5a987e..58d7757 100644 --- a/nix/package.nix +++ b/nix/package.nix @@ -22,7 +22,7 @@ buildGoModule (finalAttrs: { vendorHash = "sha256-jMqPVvMZDm406Gi2g4zNSRJMySLAN7/CR/2NgF+gqtA="; - ldflags = ["-s" "-w" "-X main.version=${finalAttrs.version}"]; + ldflags = ["-s" "-w" "-X main.Version=${finalAttrs.version}"]; # Copy web assets postInstall = '' From 531aafb09453e340e8df812e6db901cc692c015a Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Mon, 2 Mar 2026 22:08:50 +0300 Subject: [PATCH 07/10] docs: document configuration behaviour; notes on environment vars Signed-off-by: NotAShelf Change-Id: I071c766ba98ed03e0b10928c25af0d0b6a6a6964 --- docs/configuration.md | 237 ++++++++++++++++++++++++++++++++++++++++++ 
1 file changed, 237 insertions(+) create mode 100644 docs/configuration.md diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 0000000..2bdd734 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,237 @@ +# Configuration + +Watchdog supports multiple configuration sources with the following precedence +(highest to lowest): + +1. **Command-line flags** +2. **Environment variables** +3. **Configuration file** +4. **Defaults** + +## Configuration File + +The primary configuration method is via YAML file. By default, Watchdog looks +for: + +- `./config.yaml` (current directory) +- `/etc/watchdog/config.yaml` (system-wide) + +Specify a custom location: + +```bash +# Provide your configuration YAML file with --config +$ watchdog --config /path/to/config.yaml +``` + +See [config.example.yaml](../config.example.yaml) for all available options. + +## Environment Variables + +All configuration options can be set via environment variables with the +`WATCHDOG_` prefix. + +Nested fields use underscore separators. 
For example: + +```bash +# site.domains +$ export WATCHDOG_SITE_DOMAINS="example.com,blog.example.com" + +# server.listen_addr +$ export WATCHDOG_SERVER_LISTEN_ADDR="127.0.0.1:8080" + +# site.collect.pageviews +$ export WATCHDOG_SITE_COLLECT_PAGEVIEWS=true + +# limits.max_paths +$ export WATCHDOG_LIMITS_MAX_PATHS=10000 +``` + +### Common Environment Variables + +```bash +# Server +WATCHDOG_SERVER_LISTEN_ADDR="127.0.0.1:8080" +WATCHDOG_SERVER_METRICS_PATH="/metrics" +WATCHDOG_SERVER_INGESTION_PATH="/api/event" +WATCHDOG_SERVER_STATE_PATH="/var/lib/watchdog/hll.state" + +# Site +WATCHDOG_SITE_DOMAINS="example.com" # comma-separated for multiple +WATCHDOG_SITE_SALT_ROTATION="daily" +WATCHDOG_SITE_SAMPLING=1.0 + +# Collection +WATCHDOG_SITE_COLLECT_PAGEVIEWS=true +WATCHDOG_SITE_COLLECT_COUNTRY=true +WATCHDOG_SITE_COLLECT_DEVICE=true +WATCHDOG_SITE_COLLECT_REFERRER="domain" +WATCHDOG_SITE_COLLECT_DOMAIN=false + +# Limits +WATCHDOG_LIMITS_MAX_PATHS=10000 +WATCHDOG_LIMITS_MAX_SOURCES=500 +WATCHDOG_LIMITS_MAX_CUSTOM_EVENTS=100 +WATCHDOG_LIMITS_MAX_EVENTS_PER_MINUTE=10000 + +# Security +WATCHDOG_SECURITY_CORS_ENABLED=false +WATCHDOG_SECURITY_METRICS_AUTH_ENABLED=false +WATCHDOG_SECURITY_METRICS_AUTH_USERNAME="admin" +WATCHDOG_SECURITY_METRICS_AUTH_PASSWORD="changeme" +``` + +## Command-Line Flags + +Command-line flags override both config file and environment variables: + +```bash +# Override server address +watchdog --listen-addr :9090 + +# Override metrics path +watchdog --metrics-path /prometheus/metrics + +# Override ingestion path +watchdog --ingestion-path /api/v1/event + +# Combine multiple overrides +watchdog --config prod.yaml --listen-addr :9090 --metrics-path /metrics +``` + +Available flags: + +- `--config string` - Path to config file +- `--listen-addr string` - Server listen address +- `--metrics-path string` - Metrics endpoint path +- `--ingestion-path string` - Ingestion endpoint path + +## Configuration Precedence Example + +Given: + +**config.yaml:** + 
+```yaml +server: + listen_addr: ":8080" + metrics_path: "/metrics" +``` + +**Environment:** + +```bash +export WATCHDOG_SERVER_LISTEN_ADDR=":9090" +``` + +**Command:** + +```bash +watchdog --metrics-path "/prometheus/metrics" +``` + +**Result:** + +- `listen_addr`: `:9090` (from environment variable) +- `metrics_path`: `/prometheus/metrics` (from CLI flag) + +## Systemd Integration + +Environment variables work seamlessly with systemd: + +```ini +[Service] +Environment="WATCHDOG_SERVER_LISTEN_ADDR=127.0.0.1:8080" +Environment="WATCHDOG_SITE_DOMAINS=example.com" +Environment="WATCHDOG_LIMITS_MAX_PATHS=10000" +ExecStart=/usr/local/bin/watchdog --config /etc/watchdog/config.yaml +``` + +Or use `EnvironmentFile`: + +```ini +[Service] +EnvironmentFile=/etc/watchdog/env +ExecStart=/usr/local/bin/watchdog +``` + +**/etc/watchdog/env:** + +```bash +WATCHDOG_SERVER_LISTEN_ADDR=127.0.0.1:8080 +WATCHDOG_SITE_DOMAINS=example.com +WATCHDOG_LIMITS_MAX_PATHS=10000 +``` + +## NixOS Integration + +NixOS configuration automatically converts to the correct format: + +```nix +{ + services.watchdog = { + enable = true; + settings = { + site.domains = [ "example.com" ]; + server.listen_addr = "127.0.0.1:8080"; + limits.max_paths = 10000; + }; + }; +} +``` + +This is equivalent to setting environment variables or using a config file. + +## Validation + +Configuration is validated on startup. Invalid values will cause Watchdog to +exit with an error: + +```bash +$ watchdog +Error: config validation failed: site.domains is required +``` + +Common validation errors: + +- `site.domains is required` - No domains configured +- `limits.max_paths must be greater than 0` - Invalid cardinality limit +- `site.collect.referrer must be 'off', 'domain', or 'url'` - Invalid referrer + mode +- `site.sampling must be between 0.0 and 1.0` - Invalid sampling rate + +## Best Practices + +1. **Use config file for base configuration** - Easier to version control and + review +2. 
**Use environment variables for secrets** - Don't commit passwords to config + files +3. **Use CLI flags for testing/overrides** - Quick temporary changes without + editing files + +Example hybrid approach: + +**config.yaml:** + +```yaml +site: + domains: + - example.com + collect: + pageviews: true + device: true + +limits: + max_paths: 10000 +``` + +**Environment (secrets):** + +```bash +export WATCHDOG_SECURITY_METRICS_AUTH_PASSWORD="$SECRET_PASSWORD" +``` + +**CLI (testing):** + +```bash +watchdog --listen-addr :9090 # Test on different port +``` From 3363e5c9232849624a57d1820070c43d9ea83891 Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Mon, 2 Mar 2026 22:24:52 +0300 Subject: [PATCH 08/10] docs: include process metrics under available exports Signed-off-by: NotAShelf Change-Id: I0df00ecfddf98db1ebc85c2fc7758e326a6a6964 --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fae2a98..f64506a 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ installation mechanism. $ go build -o watchdog . 
# Run
-$ ./watchdog -config config.yaml
+$ ./watchdog --config config.yaml
 ```
 
 ## Configuration
@@ -207,11 +207,19 @@ While not final, some of the metrics collected are as follows:
 
 - `web_custom_events_total{event}` - Custom event counts
 - `web_daily_unique_visitors` - Estimated unique visitors (HyperLogLog)
 
-**Health metrics:**
+**Cardinality metrics:**
 
 - `web_path_overflow_total` - Paths rejected due to cardinality limit
 - `web_referrer_overflow_total` - Referrers rejected due to limit
 - `web_event_overflow_total` - Custom events rejected due to limit
+- `web_blocked_requests_total{reason}` - File server requests blocked by security filters
+
+**Process metrics:**
+
+- `watchdog_build_info{version,commit,build_date}` - Build metadata
+- `watchdog_start_time_seconds` - Unix timestamp of process start
+- `go_*` - Go runtime metrics (goroutines, GC, memory)
+- `process_*` - OS process metrics (CPU, RSS, file descriptors)
 
 ## Privacy
 
From 13343ef2bda5805731a27e0e988918ecfd621c9f Mon Sep 17 00:00:00 2001
From: NotAShelf
Date: Mon, 2 Mar 2026 22:25:13 +0300
Subject: [PATCH 09/10] nix: format Markdown with `deno fmt`

Signed-off-by: NotAShelf
Change-Id: Id652cb01903d1ca4de4b8839118fac556a6a6964
---
 flake.nix | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/flake.nix b/flake.nix
index bd38fad..15dc2d4 100644
--- a/flake.nix
+++ b/flake.nix
@@ -27,6 +27,7 @@
         pkgs.alejandra
         pkgs.fd
         pkgs.prettier
+        pkgs.deno
         pkgs.go # provides gofmt
         pkgs.golines
       ];
@@ -38,6 +39,9 @@
         # Format HTML & Javascript files with Prettier
         fd "$@" -t f -e html -e js -x prettier -w '{}'
 
+        # Format Markdown with Deno's Markdown formatter
+        fd "$@" -t f -e md -x deno fmt -q '{}'
+
         # Format go files with both gofmt & golines
         fd "$@" -t f -e go -x golines -l -w --max-len=110 \
           --base-formatter=gofmt \
From df06ed38bf540f5a626db04bc570050eac3286f6 Mon Sep 17 00:00:00 2001
From: NotAShelf
Date: Mon, 2 Mar 2026 22:35:32 +0300
Subject: [PATCH 10/10] docs: provide observability stack guide

Signed-off-by: 
NotAShelf Change-Id: Ibadc31d02413da836e85eaa3d446eb9e6a6a6964 --- README.md | 3 +- docs/observability.md | 300 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 docs/observability.md diff --git a/README.md b/README.md index f64506a..325dace 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,8 @@ While not final, some of the metrics collected are as follows: - `web_path_overflow_total` - Paths rejected due to cardinality limit - `web_referrer_overflow_total` - Referrers rejected due to limit - `web_event_overflow_total` - Custom events rejected due to limit -- `web_blocked_requests_total{reason}` - File server requests blocked by security filters +- `web_blocked_requests_total{reason}` - File server requests blocked by + security filters **Process metrics:** diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..1f3b9dd --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,300 @@ +# Observability Setup + +Watchdog exposes Prometheus-formatted metrics at `/metrics`. You need a +time-series database to scrape and store these metrics, then visualize them in +Grafana. 
+ +> [!IMPORTANT] +> +> **Why you need Prometheus:** +> +> - Watchdog exposes _current state_ (counters, gauges) +> - Prometheus _scrapes periodically_ and _stores time-series data_ +> - Grafana _visualizes_ the historical data from Prometheus +> - Grafana cannot directly scrape Prometheus `/metrics` endpoints + +## Prometheus Setup + +### Configuring Prometheus + +Create `/etc/prometheus/prometheus.yml`: + +```yaml +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: "watchdog" + static_configs: + - targets: ["localhost:8080"] + + # Optional: scrape multiple Watchdog instances + # static_configs: + # - targets: + # - 'watchdog-1.example.com:8080' + # - 'watchdog-2.example.com:8080' + # labels: + # instance: 'production' + + # Scrape Prometheus itself + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] +``` + +### Verify Prometheus' health state + +```bash +# Check Prometheus is running +curl http://localhost:9090/-/healthy + +# Check it's scraping Watchdog +curl http://localhost:9090/api/v1/targets +``` + +### NixOS + +Add to your NixOS configuration: + +```nix +{ + services.prometheus = { + enable = true; + port = 9090; + + # Retention period + retentionTime = "30d"; + + scrapeConfigs = [ + { + job_name = "watchdog"; + static_configs = [{ + targets = [ "localhost:8080" ]; + }]; + } + ]; + }; + + # Open firewall if needed + # networking.firewall.allowedTCPPorts = [ 9090 ]; +} +``` + +For multiple Watchdog instances: + +```nix +{ + services.prometheus.scrapeConfigs = [ + { + job_name = "watchdog"; + static_configs = [ + { + labels.env = "production"; + targets = [ + "watchdog-1:8080" + "watchdog-2:8080" + "watchdog-3:8080" + ]; + } + ]; + } + ]; +} +``` + +## Grafana Setup + +### NixOS + +```nix +{ + services.grafana = { + enable = true; + settings = { + server = { + http_addr = "127.0.0.1"; + http_port = 3000; + }; + }; + + provision = { + enable = true; + + datasources.settings.datasources = [{ + 
name = "Prometheus"; + type = "prometheus"; + url = "http://localhost:9090"; + isDefault = true; + }]; + }; + }; +} +``` + +### Configure Data Source (Manual) + +If you're not using NixOS for provisioning, then you'll need to do provisioning +_imperatively_ from your Grafana configuration. Ths can be done through the +admin panel by navigating to `Configuration`, and choosing "add data source" +under `Data Sources`. Select your prometheus instance, and save it. + +### Import Pre-built Dashboard + +A sample Grafana dashboard is provided with support for multi-host and +multi-site configurations. Import it, configure the data source and it should +work out of the box. + +If you're not using NixOS for provisioning, the dashboard _also_ needs to be +provisioned manually. Under `Dashboards`, select `Import` and provide the JSON +contents or upload the sample dashboard from `contrib/grafana/watchdog.json`. +Select your Prometheus data source and import it. + +See [contrib/grafana/README.md](../contrib/grafana/README.md) for full +documentation. + +## Example Queries + +Once Prometheus is scraping Watchdog and Grafana is connected, you may write +your own widgets or create queries. Here are some example queries using +Prometheus query language, promql. Those are provided as examples and might not +provide everything you need. Nevertheless, use them to improve your setup at +your disposal. + +If you believe you have some valuable widgets that you'd like to contribute +back, feel free! 
+ +### Top 10 Pages by Traffic + +```promql +topk(10, sum by (path) (rate(web_pageviews_total[5m]))) +``` + +### Mobile vs Desktop Split + +```promql +sum by (device) (rate(web_pageviews_total[1h])) +``` + +### Unique Visitors + +```promql +web_daily_unique_visitors +``` + +### Top Referrers + +```promql +topk(10, sum by (referrer) (rate(web_pageviews_total{referrer!="direct"}[1d]))) +``` + +### Multi-Site: Traffic per Domain + +```promql +sum by (domain) (rate(web_pageviews_total[1h])) +``` + +### Cardinality Health + +```promql +# Should be near zero +rate(web_path_overflow_total[5m]) +rate(web_referrer_overflow_total[5m]) +rate(web_event_overflow_total[5m]) +``` + +## Horizontal Scaling Considerations + +When running multiple Watchdog instances: + +1. **Each instance exposes its own metrics** - Prometheus scrapes all instances +2. **Prometheus aggregates automatically** - use `sum()` in queries to aggregate + across instances +3. **No shared state needed** - each Watchdog instance is independent + +Watchdog is almost entirely stateless, so horizontal scaling should be trivial +as long as you have the necessary infrastructure and, well, the patience. 
+Example with 3 instances: + +```promql +# Total pageviews across all instances +sum(rate(web_pageviews_total[5m])) + +# Per-instance breakdown +sum by (instance) (rate(web_pageviews_total[5m])) +``` + +## Alternatives to Prometheus + +### VictoriaMetrics + +Drop-in Prometheus replacement with better performance and compression: + +```nix +{ + services.victoriametrics = { + enable = true; + listenAddress = ":8428"; + retentionPeriod = "12month"; + }; + + # Configure Prometheus to remote-write to VictoriaMetrics + services.prometheus = { + enable = true; + remoteWrite = [{ + url = "http://localhost:8428/api/v1/write"; + }]; + }; +} +``` + +### Grafana Agent + +Lightweight alternative that scrapes and forwards to Grafana Cloud or local +Prometheus: + +```bash +# Systemd setup for Grafana Agent +sudo systemctl enable --now grafana-agent +``` + +```yaml +# /etc/grafana-agent.yaml +metrics: + wal_directory: /var/lib/grafana-agent + configs: + - name: watchdog + scrape_configs: + - job_name: watchdog + static_configs: + - targets: ["localhost:8080"] + remote_write: + - url: http://localhost:9090/api/v1/write +``` + +## Monitoring the Monitoring + +Monitor Prometheus itself: + +```promql +# Prometheus scrape success rate +up{job="watchdog"} + +# Scrape duration +scrape_duration_seconds{job="watchdog"} + +# Time since last scrape +time() - timestamp(up{job="watchdog"}) +``` + +## Additional Recommendations + +1. **Retention**: Set `--storage.tsdb.retention.time=30d` or longer based on + disk space +2. **Backups**: Back up `/var/lib/prometheus` periodically (or whatever your + state directory is) +3. **Alerting**: Configure Prometheus alerting rules for critical metrics +4. **High Availability**: Run multiple Prometheus instances with identical + configs +5. **Remote Storage**: For long-term storage, use Thanos, Cortex, or + VictoriaMetrics