internal/aggregate: add HyperLogLog unique visitor tracking

Extracts IP from X-Forwarded-For/X-Real-IP/RemoteAddr. Only active
when `config.Site.SaltRotation` is set.

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: Ieef93b81e9894fc2e9e129451bf2dfdf6a6a6964
This commit is contained in:
raf 2026-03-01 05:20:43 +03:00
commit 993e47e603
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
6 changed files with 269 additions and 0 deletions

View file

@ -12,6 +12,8 @@ type MetricsAggregator struct {
pageviews *prometheus.CounterVec
customEvents *prometheus.CounterVec
pathOverflow prometheus.Counter
dailyUniques prometheus.Gauge
estimator *UniquesEstimator
}
// Creates a new metrics aggregator with dynamic labels based on config
@ -54,12 +56,21 @@ func NewMetricsAggregator(registry *PathRegistry, cfg config.Config) *MetricsAgg
},
)
dailyUniques := prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "web_daily_unique_visitors",
Help: "Estimated unique visitors today (HyperLogLog)",
},
)
return &MetricsAggregator{
registry: registry,
cfg: cfg,
pageviews: pageviews,
customEvents: customEvents,
pathOverflow: pathOverflow,
dailyUniques: dailyUniques,
estimator: NewUniquesEstimator(),
}
}
@ -93,9 +104,20 @@ func (m *MetricsAggregator) RecordPathOverflow() {
m.pathOverflow.Inc()
}
// Adds a visitor to the unique visitor estimator. Only tracks if salt_rotation is configured
func (m *MetricsAggregator) AddUnique(ip, userAgent string) {
if m.cfg.Site.SaltRotation == "" {
return // only track if salt rotation is enabled
}
m.estimator.Add(ip, userAgent)
m.dailyUniques.Set(float64(m.estimator.Estimate()))
}
// Registers all metrics with the provided Prometheus registry
func (m *MetricsAggregator) MustRegister(reg prometheus.Registerer) {
reg.MustRegister(m.pageviews)
reg.MustRegister(m.customEvents)
reg.MustRegister(m.pathOverflow)
reg.MustRegister(m.dailyUniques)
}

View file

@ -0,0 +1,79 @@
package aggregate
import (
"crypto/sha256"
"encoding/hex"
"sync"
"time"
"github.com/axiomhq/hyperloglog"
)
// UniquesEstimator tracks unique visitors using HyperLogLog with daily salt rotation
type UniquesEstimator struct {
hll *hyperloglog.Sketch
currentDay string
mu sync.Mutex
}
// NewUniquesEstimator creates a new unique visitor estimator
func NewUniquesEstimator() *UniquesEstimator {
return &UniquesEstimator{
hll: hyperloglog.New(),
currentDay: dailySalt(time.Now()),
}
}
// Add records a visitor with privacy-preserving hashing
// Uses IP + UserAgent + daily salt to prevent cross-day correlation
func (u *UniquesEstimator) Add(ip, userAgent string) {
u.mu.Lock()
defer u.mu.Unlock()
// Check if we need to rotate to a new day
today := dailySalt(time.Now())
if today != u.currentDay {
// Reset HLL for new day
u.hll = hyperloglog.New()
u.currentDay = today
}
// Hash visitor with daily salt to prevent cross-day tracking
hash := hashVisitor(ip, userAgent, u.currentDay)
u.hll.Insert([]byte(hash))
}
// Estimate returns the estimated number of unique visitors
func (u *UniquesEstimator) Estimate() uint64 {
u.mu.Lock()
defer u.mu.Unlock()
return u.hll.Estimate()
}
// dailySalt generates a deterministic salt based on the current date
// Same day = same salt, different day = different salt
func dailySalt(t time.Time) string {
// Use UTC to ensure consistent rotation regardless of timezone
date := t.UTC().Format("2006-01-02")
h := sha256.Sum256([]byte("watchdog-salt-" + date))
return hex.EncodeToString(h[:])
}
// hashVisitor creates a privacy-preserving hash of visitor identity
func hashVisitor(ip, userAgent, salt string) string {
combined := ip + "|" + userAgent + "|" + salt
h := sha256.Sum256([]byte(combined))
return hex.EncodeToString(h[:])
}
// CurrentSalt returns the current salt for testing
func (u *UniquesEstimator) CurrentSalt() string {
u.mu.Lock()
defer u.mu.Unlock()
return u.currentDay
}
// DailySalt is exported for testing
func DailySalt(t time.Time) string {
return dailySalt(t)
}

View file

@ -0,0 +1,125 @@
package aggregate
import (
"testing"
"time"
)
func TestDailySalt(t *testing.T) {
// Same day should produce same salt
day1Morning := time.Date(2024, 1, 15, 9, 0, 0, 0, time.UTC)
day1Evening := time.Date(2024, 1, 15, 23, 59, 59, 0, time.UTC)
salt1 := DailySalt(day1Morning)
salt2 := DailySalt(day1Evening)
if salt1 != salt2 {
t.Errorf("Expected same salt for same day, got %s and %s", salt1, salt2)
}
// Different day should produce different salt
day2 := time.Date(2024, 1, 16, 9, 0, 0, 0, time.UTC)
salt3 := DailySalt(day2)
if salt1 == salt3 {
t.Errorf("Expected different salt for different days, got same: %s", salt1)
}
// Salt should be deterministic
salt4 := DailySalt(day1Morning)
if salt1 != salt4 {
t.Errorf("Salt should be deterministic, got %s and %s", salt1, salt4)
}
}
func TestUniquesEstimator(t *testing.T) {
estimator := NewUniquesEstimator()
// Initially should be zero
if count := estimator.Estimate(); count != 0 {
t.Errorf("Expected initial count of 0, got %d", count)
}
// Add a visitor
estimator.Add("192.168.1.1", "Mozilla/5.0")
if count := estimator.Estimate(); count != 1 {
t.Errorf("Expected count of 1 after adding visitor, got %d", count)
}
// Adding same visitor should not increase count
estimator.Add("192.168.1.1", "Mozilla/5.0")
if count := estimator.Estimate(); count != 1 {
t.Errorf("Expected count of 1 after adding duplicate, got %d", count)
}
// Adding different visitor should increase count
estimator.Add("192.168.1.2", "Chrome/90.0")
if count := estimator.Estimate(); count < 2 {
t.Errorf("Expected count of at least 2 after adding different visitor, got %d", count)
}
// Test with many unique visitors
for i := range 1000 {
ip := "10.0." + string(rune(i/256)) + "." + string(rune(i%256))
estimator.Add(ip, "TestAgent")
}
count := estimator.Estimate()
// HyperLogLog has ~2% error rate, so 1000 uniques should be 980-1020
if count < 980 || count > 1020 {
t.Errorf("Expected count around 1000, got %d (error rate too high)", count)
}
}
func TestUniquesEstimatorDailyRotation(t *testing.T) {
// This test verifies that salts are different on different days
// The actual rotation happens in Add() based on current time
// Verify that different days produce different salts
day1 := time.Date(2024, 1, 15, 12, 0, 0, 0, time.UTC)
day2 := time.Date(2024, 1, 16, 12, 0, 0, 0, time.UTC)
salt1 := DailySalt(day1)
salt2 := DailySalt(day2)
if salt1 == salt2 {
t.Error("Expected different salts for different days")
}
// Verify estimator uses current day's salt
estimator := NewUniquesEstimator()
currentSalt := estimator.CurrentSalt()
expectedSalt := DailySalt(time.Now())
if currentSalt != expectedSalt {
t.Errorf("Expected estimator to use current day's salt, got %s, expected %s", currentSalt, expectedSalt)
}
}
func TestHashVisitor(t *testing.T) {
// Same inputs should produce same hash
hash1 := hashVisitor("192.168.1.1", "Mozilla/5.0", "salt123")
hash2 := hashVisitor("192.168.1.1", "Mozilla/5.0", "salt123")
if hash1 != hash2 {
t.Error("Expected same hash for same inputs")
}
// Different IP should produce different hash
hash3 := hashVisitor("192.168.1.2", "Mozilla/5.0", "salt123")
if hash1 == hash3 {
t.Error("Expected different hash for different IP")
}
// Different user agent should produce different hash
hash4 := hashVisitor("192.168.1.1", "Chrome/90.0", "salt123")
if hash1 == hash4 {
t.Error("Expected different hash for different user agent")
}
// Different salt should produce different hash
hash5 := hashVisitor("192.168.1.1", "Mozilla/5.0", "salt456")
if hash1 == hash5 {
t.Error("Expected different hash for different salt")
}
}

View file

@ -3,6 +3,7 @@ package api
import (
"log"
"net/http"
"strings"
"notashelf.dev/watchdog/internal/aggregate"
"notashelf.dev/watchdog/internal/config"
@ -70,6 +71,13 @@ func (h *IngestionHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}
// Extract visitor identity for unique tracking
ip := extractIP(r)
userAgent := r.Header.Get("User-Agent")
// Track unique visitor if salt rotation is enabled
h.metricsAgg.AddUnique(ip, userAgent)
// Process based on event type
if event.Event != "" {
// Custom event
@ -102,6 +110,32 @@ func (h *IngestionHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNoContent)
}
// extractIP extracts the client IP from the request
// Checks X-Forwarded-For and X-Real-IP headers for proxied requests
func extractIP(r *http.Request) string {
// Check X-Forwarded-For header (may contain multiple IPs)
if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
// Take the first IP in the list
ips := strings.Split(xff, ",")
if len(ips) > 0 {
return strings.TrimSpace(ips[0])
}
}
// Check X-Real-IP header
if xri := r.Header.Get("X-Real-IP"); xri != "" {
return xri
}
// Fall back to RemoteAddr
ip := r.RemoteAddr
// Strip port if present
if idx := strings.LastIndex(ip, ":"); idx != -1 {
ip = ip[:idx]
}
return ip
}
// Classifies screen width into device categories
func classifyDevice(width int) string {
// FIXME: probably not the best logic for this...