internal: implement path normalization w/ configurable rules

Strips query strings and URL fragments, and prevents unbounded Prometheus
metric cardinality by normalizing paths like:

- `/users/12345/profile -> /users/:id/profile`
- `/page?utm_source=twitter -> /page`
- `/a/../b -> /b`

etc.

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I72f2fa2452f4666567143d052b5716476a6a6964
This commit is contained in:
raf 2026-03-01 01:13:58 +03:00
commit 0691e5ee34
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
4 changed files with 303 additions and 0 deletions

View file

@ -37,6 +37,7 @@ type PathConfig struct {
StripFragment bool `yaml:"strip_fragment"`
CollapseNumericSegments bool `yaml:"collapse_numeric_segments"`
MaxSegments int `yaml:"max_segments"`
NormalizeTrailingSlash bool `yaml:"normalize_trailing_slash"`
}
// Cardinality limits

121
internal/normalize/path.go Normal file
View file

@ -0,0 +1,121 @@
package normalize
import (
"strings"
"notashelf.dev/watchdog/internal/config"
)
// maxPathLength is the size limit, in bytes, checked by Normalize; paths
// longer than this are returned without segment-level normalization.
const maxPathLength = 2048
// PathNormalizer rewrites request paths according to the rules in a
// config.PathConfig. Create one with NewPathNormalizer.
type PathNormalizer struct {
// cfg holds the normalization rules consulted by Normalize.
cfg config.PathConfig
}
// NewPathNormalizer returns a PathNormalizer that applies the rules in cfg.
// The configuration is stored by value.
func NewPathNormalizer(cfg config.PathConfig) *PathNormalizer {
	normalizer := new(PathNormalizer)
	normalizer.cfg = cfg
	return normalizer
}
// Normalize rewrites path into a canonical, low-cardinality form according
// to the normalizer's configuration and returns the result.
//
// Processing order:
//  1. An empty path yields "/".
//  2. The query string ('?' onward) and the fragment ('#' onward) are
//     removed when StripQuery / StripFragment are enabled.
//  3. If the remaining path is longer than maxPathLength bytes it is
//     returned without any segment processing.
//  4. A leading slash is added if missing; empty segments and "." are
//     dropped, and ".." removes the preceding segment (ignored at root).
//  5. All-digit segments become ":id" when CollapseNumericSegments is set.
//  6. Only the first MaxSegments segments are kept when MaxSegments > 0.
//  7. A trailing slash is dropped when NormalizeTrailingSlash is set;
//     otherwise it is kept if the input had one, the result is not the
//     root, and no segments were cut off by MaxSegments.
func (n *PathNormalizer) Normalize(path string) string {
	if path == "" {
		return "/"
	}

	// Strip before the length guard so that an oversized query string or
	// fragment cannot carry an otherwise short path past normalization.
	if n.cfg.StripQuery {
		if idx := strings.IndexByte(path, '?'); idx != -1 {
			path = path[:idx]
		}
	}
	if n.cfg.StripFragment {
		if idx := strings.IndexByte(path, '#'); idx != -1 {
			path = path[:idx]
		}
	}

	// Oversized paths skip the segment work below.
	if len(path) > maxPathLength {
		return path
	}

	if !strings.HasPrefix(path, "/") {
		path = "/" + path
	}

	// Splitting discards empty segments, so the trailing slash has to be
	// remembered here if the configuration asks for it to be preserved.
	keepTrailingSlash := !n.cfg.NormalizeTrailingSlash && strings.HasSuffix(path, "/")

	// Single pass: drop empty segments (leading and doubled slashes) and
	// resolve "." / ".." as we go.
	parts := strings.Split(path, "/")
	segments := make([]string, 0, len(parts))
	for _, seg := range parts {
		switch seg {
		case "", ".":
			// Nothing to record.
		case "..":
			// Go up one level; at root there is nowhere to go.
			if len(segments) > 0 {
				segments = segments[:len(segments)-1]
			}
		default:
			segments = append(segments, seg)
		}
	}

	if n.cfg.CollapseNumericSegments {
		for i, seg := range segments {
			if isNumeric(seg) {
				segments[i] = ":id"
			}
		}
	}

	if n.cfg.MaxSegments > 0 && len(segments) > n.cfg.MaxSegments {
		segments = segments[:n.cfg.MaxSegments]
		// The original trailing slash belonged to a segment that was cut.
		keepTrailingSlash = false
	}

	if len(segments) == 0 {
		return "/"
	}

	result := "/" + strings.Join(segments, "/")
	if keepTrailingSlash {
		result += "/"
	}
	return result
}
// isNumeric reports whether s is non-empty and made up exclusively of the
// ASCII digits '0' through '9'.
func isNumeric(s string) bool {
	for i := 0; i < len(s); i++ {
		if d := s[i]; d < '0' || d > '9' {
			return false
		}
	}
	return len(s) > 0
}

View file

@ -0,0 +1,180 @@
package normalize
import (
"strings"
"testing"
"notashelf.dev/watchdog/internal/config"
)
// TestNormalizePath runs PathNormalizer.Normalize against a table of inputs,
// each paired with the configuration to use and the expected output.
func TestNormalizePath(t *testing.T) {
	type normalizeCase struct {
		name  string
		cfg   config.PathConfig
		input string
		want  string
	}

	// Longer than the normalizer's length limit; expected back unchanged.
	longPath := "/" + strings.Repeat("a", 2050)

	cases := []normalizeCase{
		{
			name:  "strip query string",
			cfg:   config.PathConfig{StripQuery: true},
			input: "/page?utm_source=twitter&id=123",
			want:  "/page",
		},
		{
			name:  "strip fragment",
			cfg:   config.PathConfig{StripFragment: true},
			input: "/page#section",
			want:  "/page",
		},
		{
			name:  "collapse numeric segments",
			cfg:   config.PathConfig{CollapseNumericSegments: true},
			input: "/users/12345/profile",
			want:  "/users/:id/profile",
		},
		{
			name:  "limit segments",
			cfg:   config.PathConfig{MaxSegments: 3},
			input: "/a/b/c/d/e/f",
			want:  "/a/b/c",
		},
		{
			name: "combined normalization",
			cfg: config.PathConfig{
				StripQuery:              true,
				StripFragment:           true,
				CollapseNumericSegments: true,
				MaxSegments:             5,
			},
			input: "/posts/2024/12/25/my-post?ref=home#comments",
			want:  "/posts/:id/:id/:id/my-post",
		},
		{
			name:  "root path unchanged",
			cfg:   config.PathConfig{StripQuery: true},
			input: "/",
			want:  "/",
		},
		{
			name:  "empty path becomes root",
			cfg:   config.PathConfig{},
			input: "",
			want:  "/",
		},
		{
			name:  "path traversal with ..",
			cfg:   config.PathConfig{},
			input: "/a/../b",
			want:  "/b",
		},
		{
			name:  "path traversal with .",
			cfg:   config.PathConfig{},
			input: "/a/./b",
			want:  "/a/b",
		},
		{
			name:  "complex traversal",
			cfg:   config.PathConfig{},
			input: "/a/b/../c/./d",
			want:  "/a/c/d",
		},
		{
			name:  "traversal beyond root",
			cfg:   config.PathConfig{},
			input: "/../../../etc",
			want:  "/etc",
		},
		{
			name:  "double slashes",
			cfg:   config.PathConfig{},
			input: "/a//b///c",
			want:  "/a/b/c",
		},
		{
			name:  "trailing slash normalization",
			cfg:   config.PathConfig{NormalizeTrailingSlash: true},
			input: "/users/",
			want:  "/users",
		},
		{
			name:  "root trailing slash preserved",
			cfg:   config.PathConfig{NormalizeTrailingSlash: true},
			input: "/",
			want:  "/",
		},
		{
			name:  "very long path",
			cfg:   config.PathConfig{},
			input: longPath,
			want:  longPath,
		},
		{
			name:  "dot segments only",
			cfg:   config.PathConfig{},
			input: "/./././",
			want:  "/",
		},
		{
			name:  "parent segments only",
			cfg:   config.PathConfig{},
			input: "/../..",
			want:  "/",
		},
		{
			name:  "combined: traversal, slashes, and trailing slash",
			cfg:   config.PathConfig{NormalizeTrailingSlash: true},
			input: "/a//b/../c/./d/",
			want:  "/a/c/d",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			got := NewPathNormalizer(tc.cfg).Normalize(tc.input)
			if got == tc.want {
				return
			}
			t.Errorf("Normalize(%q) = %q, want %q", tc.input, got, tc.want)
		})
	}
}
// TestIsNumeric checks isNumeric against digit-only, mixed, and empty
// inputs, running one subtest per input string.
func TestIsNumeric(t *testing.T) {
	type numericCase struct {
		input string
		want  bool
	}

	cases := []numericCase{
		{input: "123", want: true},
		{input: "0", want: true},
		{input: "abc", want: false},
		{input: "12abc", want: false},
		{input: "", want: false},
		{input: "2024-12-25", want: false},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.input, func(t *testing.T) {
			if got := isNumeric(tc.input); got != tc.want {
				t.Errorf("isNumeric(%q) = %v, want %v", tc.input, got, tc.want)
			}
		})
	}
}

View file

@ -14,6 +14,7 @@ site:
strip_fragment: true
collapse_numeric_segments: true
max_segments: 5
normalize_trailing_slash: true
limits:
max_paths: 1000