diff --git a/go.mod b/go.mod index 0dbe9d8..bdafebb 100644 --- a/go.mod +++ b/go.mod @@ -7,8 +7,11 @@ require gopkg.in/yaml.v3 v3.0.1 require golang.org/x/net v0.51.0 require ( + github.com/axiomhq/hyperloglog v0.2.6 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33 // indirect + github.com/kamstrup/intmap v0.5.2 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/prometheus/client_golang v1.23.2 // indirect diff --git a/go.sum b/go.sum index b1985a5..97f9bca 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,13 @@ +github.com/axiomhq/hyperloglog v0.2.6 h1:sRhvvF3RIXWQgAXaTphLp4yJiX4S0IN3MWTaAgZoRJw= +github.com/axiomhq/hyperloglog v0.2.6/go.mod h1:YjX/dQqCR/7QYX0g8mu8UZAjpIenz1FKM71UEsjFoTo= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33 h1:ucRHb6/lvW/+mTEIGbvhcYU3S8+uSNkuMjx/qZFfhtM= +github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= +github.com/kamstrup/intmap v0.5.2 h1:qnwBm1mh4XAnW9W9Ue9tZtTff8pS6+s6iKF6JRIV2Dk= +github.com/kamstrup/intmap v0.5.2/go.mod h1:gWUVWHKzWj8xpJVFf5GC0O26bWmv3GqdnIX/LMT6Aq4= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= diff --git a/internal/aggregate/metrics.go b/internal/aggregate/metrics.go index fefbb54..3742186 100644 --- a/internal/aggregate/metrics.go +++ b/internal/aggregate/metrics.go @@ -12,6 +12,8 @@ type MetricsAggregator struct { pageviews *prometheus.CounterVec customEvents *prometheus.CounterVec pathOverflow prometheus.Counter + dailyUniques prometheus.Gauge + estimator *UniquesEstimator } // Creates a new metrics aggregator with dynamic labels based on config @@ -54,12 +56,21 @@ func NewMetricsAggregator(registry *PathRegistry, cfg config.Config) *MetricsAgg }, ) + dailyUniques := prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "web_daily_unique_visitors", + Help: "Estimated unique visitors today (HyperLogLog)", + }, + ) + return &MetricsAggregator{ registry: registry, cfg: cfg, pageviews: pageviews, customEvents: customEvents, pathOverflow: pathOverflow, + dailyUniques: dailyUniques, + estimator: NewUniquesEstimator(), } } @@ -93,9 +104,20 @@ func (m *MetricsAggregator) RecordPathOverflow() { m.pathOverflow.Inc() } +// Adds a visitor to the unique visitor estimator. Only tracks if salt_rotation is configured +func (m *MetricsAggregator) AddUnique(ip, userAgent string) { + if m.cfg.Site.SaltRotation == "" { + return // only track if salt rotation is enabled + } + + m.estimator.Add(ip, userAgent) + m.dailyUniques.Set(float64(m.estimator.Estimate())) +} + // Registers all metrics with the provided Prometheus registry func (m *MetricsAggregator) MustRegister(reg prometheus.Registerer) { reg.MustRegister(m.pageviews) reg.MustRegister(m.customEvents) reg.MustRegister(m.pathOverflow) + reg.MustRegister(m.dailyUniques) } diff --git a/internal/aggregate/uniques.go b/internal/aggregate/uniques.go new file mode 100644 index 0000000..b1049e0 --- /dev/null +++ b/internal/aggregate/uniques.go @@ -0,0 +1,79 @@ +package aggregate + +import ( + "crypto/sha256" + "encoding/hex" + "sync" + "time" + + "github.com/axiomhq/hyperloglog" +) + +// UniquesEstimator tracks unique visitors using HyperLogLog with daily salt rotation +type UniquesEstimator struct { + hll *hyperloglog.Sketch + currentDay string + mu sync.Mutex +} + +// NewUniquesEstimator creates a new unique visitor estimator +func NewUniquesEstimator() *UniquesEstimator { + return &UniquesEstimator{ + hll: hyperloglog.New(), + currentDay: dailySalt(time.Now()), + } +} + +// Add records a visitor with privacy-preserving hashing +// Uses IP + UserAgent + daily salt to prevent cross-day correlation +func (u *UniquesEstimator) Add(ip, userAgent string) { + u.mu.Lock() + defer u.mu.Unlock() + + // Check if we need to rotate to a new day + today := dailySalt(time.Now()) + if today != u.currentDay { + // Reset HLL for new day + u.hll = hyperloglog.New() + u.currentDay = today + } + + // Hash visitor with daily salt to prevent cross-day tracking + hash := hashVisitor(ip, userAgent, u.currentDay) + u.hll.Insert([]byte(hash)) +} + +// Estimate returns the estimated number of unique visitors +func (u *UniquesEstimator) Estimate() uint64 { + u.mu.Lock() + defer u.mu.Unlock() + return u.hll.Estimate() +} + +// dailySalt generates a deterministic salt based on the current date +// Same day = same salt, different day = different salt +func dailySalt(t time.Time) string { + // Use UTC to ensure consistent rotation regardless of timezone + date := t.UTC().Format("2006-01-02") + h := sha256.Sum256([]byte("watchdog-salt-" + date)) + return hex.EncodeToString(h[:]) +} + +// hashVisitor creates a privacy-preserving hash of visitor identity +func hashVisitor(ip, userAgent, salt string) string { + combined := ip + "|" + userAgent + "|" + salt + h := sha256.Sum256([]byte(combined)) + return hex.EncodeToString(h[:]) +} + +// CurrentSalt returns the current salt for testing +func (u *UniquesEstimator) CurrentSalt() string { + u.mu.Lock() + defer u.mu.Unlock() + return u.currentDay +} + +// DailySalt is exported for testing +func DailySalt(t time.Time) string { + return dailySalt(t) +} diff --git a/internal/aggregate/uniques_test.go b/internal/aggregate/uniques_test.go new file mode 100644 index 0000000..0be3758 --- /dev/null +++ b/internal/aggregate/uniques_test.go @@ -0,0 +1,125 @@ +package aggregate + +import ( + "testing" + "time" +) + +func TestDailySalt(t *testing.T) { + // Same day should produce same salt + day1Morning := time.Date(2024, 1, 15, 9, 0, 0, 0, time.UTC) + day1Evening := time.Date(2024, 1, 15, 23, 59, 59, 0, time.UTC) + + salt1 := DailySalt(day1Morning) + salt2 := DailySalt(day1Evening) + + if salt1 != salt2 { + t.Errorf("Expected same salt for same day, got %s and %s", salt1, salt2) + } + + // Different day should produce different salt + day2 := time.Date(2024, 1, 16, 9, 0, 0, 0, time.UTC) + salt3 := DailySalt(day2) + + if salt1 == salt3 { + t.Errorf("Expected different salt for different days, got same: %s", salt1) + } + + // Salt should be deterministic + salt4 := DailySalt(day1Morning) + if salt1 != salt4 { + t.Errorf("Salt should be deterministic, got %s and %s", salt1, salt4) + } +} + +func TestUniquesEstimator(t *testing.T) { + estimator := NewUniquesEstimator() + + // Initially should be zero + if count := estimator.Estimate(); count != 0 { + t.Errorf("Expected initial count of 0, got %d", count) + } + + // Add a visitor + estimator.Add("192.168.1.1", "Mozilla/5.0") + if count := estimator.Estimate(); count != 1 { + t.Errorf("Expected count of 1 after adding visitor, got %d", count) + } + + // Adding same visitor should not increase count + estimator.Add("192.168.1.1", "Mozilla/5.0") + if count := estimator.Estimate(); count != 1 { + t.Errorf("Expected count of 1 after adding duplicate, got %d", count) + } + + // Adding different visitor should increase count + estimator.Add("192.168.1.2", "Chrome/90.0") + if count := estimator.Estimate(); count < 2 { + t.Errorf("Expected count of at least 2 after adding different visitor, got %d", count) + } + + // Test with many unique visitors + for i := range 1000 { + ip := "10.0." + string(rune(i/256)) + "." + string(rune(i%256)) + estimator.Add(ip, "TestAgent") + } + + count := estimator.Estimate() + // HyperLogLog has ~2% error rate, so 1000 uniques should be 980-1020 + if count < 980 || count > 1020 { + t.Errorf("Expected count around 1000, got %d (error rate too high)", count) + } +} + +func TestUniquesEstimatorDailyRotation(t *testing.T) { + // This test verifies that salts are different on different days + // The actual rotation happens in Add() based on current time + + // Verify that different days produce different salts + day1 := time.Date(2024, 1, 15, 12, 0, 0, 0, time.UTC) + day2 := time.Date(2024, 1, 16, 12, 0, 0, 0, time.UTC) + + salt1 := DailySalt(day1) + salt2 := DailySalt(day2) + + if salt1 == salt2 { + t.Error("Expected different salts for different days") + } + + // Verify estimator uses current day's salt + estimator := NewUniquesEstimator() + currentSalt := estimator.CurrentSalt() + expectedSalt := DailySalt(time.Now()) + + if currentSalt != expectedSalt { + t.Errorf("Expected estimator to use current day's salt, got %s, expected %s", currentSalt, expectedSalt) + } +} + +func TestHashVisitor(t *testing.T) { + // Same inputs should produce same hash + hash1 := hashVisitor("192.168.1.1", "Mozilla/5.0", "salt123") + hash2 := hashVisitor("192.168.1.1", "Mozilla/5.0", "salt123") + + if hash1 != hash2 { + t.Error("Expected same hash for same inputs") + } + + // Different IP should produce different hash + hash3 := hashVisitor("192.168.1.2", "Mozilla/5.0", "salt123") + if hash1 == hash3 { + t.Error("Expected different hash for different IP") + } + + // Different user agent should produce different hash + hash4 := hashVisitor("192.168.1.1", "Chrome/90.0", "salt123") + if hash1 == hash4 { + t.Error("Expected different hash for different user agent") + } + + // Different salt should produce different hash + hash5 := hashVisitor("192.168.1.1", "Mozilla/5.0", "salt456") + if hash1 == hash5 { + t.Error("Expected different hash for different salt") + } +} diff --git a/internal/api/handler.go b/internal/api/handler.go index 21827c0..e8b3545 100644 --- a/internal/api/handler.go +++ b/internal/api/handler.go @@ -3,6 +3,7 @@ package api import ( "log" "net/http" + "strings" "notashelf.dev/watchdog/internal/aggregate" "notashelf.dev/watchdog/internal/config" @@ -70,6 +71,13 @@ func (h *IngestionHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { return } + // Extract visitor identity for unique tracking + ip := extractIP(r) + userAgent := r.Header.Get("User-Agent") + + // Track unique visitor if salt rotation is enabled + h.metricsAgg.AddUnique(ip, userAgent) + // Process based on event type if event.Event != "" { // Custom event @@ -102,6 +110,32 @@ func (h *IngestionHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNoContent) } +// extractIP extracts the client IP from the request +// Checks X-Forwarded-For and X-Real-IP headers for proxied requests +func extractIP(r *http.Request) string { + // Check X-Forwarded-For header (may contain multiple IPs) + if xff := r.Header.Get("X-Forwarded-For"); xff != "" { + // Take the first IP in the list + ips := strings.Split(xff, ",") + if len(ips) > 0 { + return strings.TrimSpace(ips[0]) + } + } + + // Check X-Real-IP header + if xri := r.Header.Get("X-Real-IP"); xri != "" { + return xri + } + + // Fall back to RemoteAddr + ip := r.RemoteAddr + // Strip port if present + if idx := strings.LastIndex(ip, ":"); idx != -1 { + ip = ip[:idx] + } + return ip +} + // Classifies screen width into device categories func classifyDevice(width int) string { // FIXME: probably not the best logic for this...