From 0691e5ee347f3702433017cffd767b9b2836a52a Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Sun, 1 Mar 2026 01:13:58 +0300 Subject: [PATCH] internal: implement path normalization w/ configurable rules Strips query strings and URL fragments, prevents unbounded Prometheus metrics by normalizing paths like: - `/users/12345/profile -> /users/:id/profile` - `/page?utm_source=twitter -> /page` - `/a/../b -> /b` etc. Signed-off-by: NotAShelf Change-Id: I72f2fa2452f4666567143d052b5716476a6a6964 --- internal/config/config.go | 1 + internal/normalize/path.go | 121 +++++++++++++++++++++ internal/normalize/path_test.go | 180 ++++++++++++++++++++++++++++++++ testdata/config.valid.yaml | 1 + 4 files changed, 303 insertions(+) create mode 100644 internal/normalize/path.go create mode 100644 internal/normalize/path_test.go diff --git a/internal/config/config.go b/internal/config/config.go index 9725105..cddab60 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -37,6 +37,7 @@ type PathConfig struct { StripFragment bool `yaml:"strip_fragment"` CollapseNumericSegments bool `yaml:"collapse_numeric_segments"` MaxSegments int `yaml:"max_segments"` + NormalizeTrailingSlash bool `yaml:"normalize_trailing_slash"` } // Cardinality limits diff --git a/internal/normalize/path.go b/internal/normalize/path.go new file mode 100644 index 0000000..e142651 --- /dev/null +++ b/internal/normalize/path.go @@ -0,0 +1,121 @@ +package normalize + +import ( + "strings" + + "notashelf.dev/watchdog/internal/config" +) + +const maxPathLength = 2048 + +type PathNormalizer struct { + cfg config.PathConfig +} + +func NewPathNormalizer(cfg config.PathConfig) *PathNormalizer { + return &PathNormalizer{cfg: cfg} +} + +func (n *PathNormalizer) Normalize(path string) string { + // Return as-is if path is too long + if len(path) > maxPathLength { + return path + } + + if path == "" { + return "/" + } + + // Strip query string + if n.cfg.StripQuery { + if idx := strings.IndexByte(path, '?'); 
idx != -1 { + path = path[:idx] + } + } + + // Strip fragment + if n.cfg.StripFragment { + if idx := strings.IndexByte(path, '#'); idx != -1 { + path = path[:idx] + } + } + + // Ensure leading slash + if !strings.HasPrefix(path, "/") { + path = "/" + path + } + + // Split into segments, first element is *always* empty for paths starting with '/' + segments := strings.Split(path, "/") + if len(segments) > 0 && segments[0] == "" { + segments = segments[1:] + } + + // Remove empty segments (from double slashes) + filtered := make([]string, 0, len(segments)) + for _, seg := range segments { + if seg != "" { + filtered = append(filtered, seg) + } + } + segments = filtered + + // Resolve . and .. segments + resolved := make([]string, 0, len(segments)) + for _, seg := range segments { + if seg == "." { + // Skip current directory + continue + } else if seg == ".." { + // Go up one level if possible + if len(resolved) > 0 { + resolved = resolved[:len(resolved)-1] + } + // If already at root, skip .. 
+ } else { + resolved = append(resolved, seg) + } + } + segments = resolved + + // Collapse numeric segments + if n.cfg.CollapseNumericSegments { + for i, seg := range segments { + if isNumeric(seg) { + segments[i] = ":id" + } + } + } + + // Limit segments + if n.cfg.MaxSegments > 0 && len(segments) > n.cfg.MaxSegments { + segments = segments[:n.cfg.MaxSegments] + } + + // Reconstruct path + var result string + if len(segments) == 0 { + result = "/" + } else { + result = "/" + strings.Join(segments, "/") + } + + // Strip trailing slash if configured (except root) + if n.cfg.NormalizeTrailingSlash && result != "/" && strings.HasSuffix(result, "/") { + result = strings.TrimSuffix(result, "/") + } + + return result +} + +func isNumeric(s string) bool { + if s == "" { + return false + } + for _, c := range s { + if c < '0' || c > '9' { + return false + } + } + return true +} diff --git a/internal/normalize/path_test.go b/internal/normalize/path_test.go new file mode 100644 index 0000000..b96c5ce --- /dev/null +++ b/internal/normalize/path_test.go @@ -0,0 +1,180 @@ +package normalize + +import ( + "strings" + "testing" + + "notashelf.dev/watchdog/internal/config" +) + +func TestNormalizePath(t *testing.T) { + tests := []struct { + name string + cfg config.PathConfig + input string + want string + }{ + { + name: "strip query string", + cfg: config.PathConfig{ + StripQuery: true, + }, + input: "/page?utm_source=twitter&id=123", + want: "/page", + }, + { + name: "strip fragment", + cfg: config.PathConfig{ + StripFragment: true, + }, + input: "/page#section", + want: "/page", + }, + { + name: "collapse numeric segments", + cfg: config.PathConfig{ + CollapseNumericSegments: true, + }, + input: "/users/12345/profile", + want: "/users/:id/profile", + }, + { + name: "limit segments", + cfg: config.PathConfig{ + MaxSegments: 3, + }, + input: "/a/b/c/d/e/f", + want: "/a/b/c", + }, + { + name: "combined normalization", + cfg: config.PathConfig{ + StripQuery: true, + StripFragment: 
true, + CollapseNumericSegments: true, + MaxSegments: 5, + }, + input: "/posts/2024/12/25/my-post?ref=home#comments", + want: "/posts/:id/:id/:id/my-post", + }, + { + name: "root path unchanged", + cfg: config.PathConfig{ + StripQuery: true, + }, + input: "/", + want: "/", + }, + { + name: "empty path becomes root", + cfg: config.PathConfig{}, + input: "", + want: "/", + }, + { + name: "path traversal with ..", + cfg: config.PathConfig{}, + input: "/a/../b", + want: "/b", + }, + { + name: "path traversal with .", + cfg: config.PathConfig{}, + input: "/a/./b", + want: "/a/b", + }, + { + name: "complex traversal", + cfg: config.PathConfig{}, + input: "/a/b/../c/./d", + want: "/a/c/d", + }, + { + name: "traversal beyond root", + cfg: config.PathConfig{}, + input: "/../../../etc", + want: "/etc", + }, + { + name: "double slashes", + cfg: config.PathConfig{}, + input: "/a//b///c", + want: "/a/b/c", + }, + { + name: "trailing slash normalization", + cfg: config.PathConfig{ + NormalizeTrailingSlash: true, + }, + input: "/users/", + want: "/users", + }, + { + name: "root trailing slash preserved", + cfg: config.PathConfig{ + NormalizeTrailingSlash: true, + }, + input: "/", + want: "/", + }, + { + name: "very long path", + cfg: config.PathConfig{}, + input: "/" + strings.Repeat("a", 2050), + want: "/" + strings.Repeat("a", 2050), + }, + { + name: "dot segments only", + cfg: config.PathConfig{}, + input: "/./././", + want: "/", + }, + { + name: "parent segments only", + cfg: config.PathConfig{}, + input: "/../..", + want: "/", + }, + { + name: "combined: traversal, slashes, and trailing slash", + cfg: config.PathConfig{ + NormalizeTrailingSlash: true, + }, + input: "/a//b/../c/./d/", + want: "/a/c/d", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + n := NewPathNormalizer(tt.cfg) + got := n.Normalize(tt.input) + if got != tt.want { + t.Errorf("Normalize(%q) = %q, want %q", tt.input, got, tt.want) + } + }) + } +} + +func TestIsNumeric(t 
*testing.T) { + tests := []struct { + input string + want bool + }{ + {"123", true}, + {"0", true}, + {"abc", false}, + {"12abc", false}, + {"", false}, + {"2024-12-25", false}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + got := isNumeric(tt.input) + if got != tt.want { + t.Errorf("isNumeric(%q) = %v, want %v", tt.input, got, tt.want) + } + }) + } +} diff --git a/testdata/config.valid.yaml b/testdata/config.valid.yaml index c723c6a..656a8f8 100644 --- a/testdata/config.valid.yaml +++ b/testdata/config.valid.yaml @@ -14,6 +14,7 @@ site: strip_fragment: true collapse_numeric_segments: true max_segments: 5 + normalize_trailing_slash: true limits: max_paths: 1000