Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: Ifb9d95d5206b7b1cf23fa3d5aaf9d0db6a6a6964
204 lines
8.5 KiB
Nix
204 lines
8.5 KiB
Nix
# NixOS VM test for remote-builder health tracking.
#
# Boots a single machine with the fc-ci module, creates a builder through the
# admin API, then drives the health columns of `remote_builders`
# (consecutive_failures, last_failure, disabled_until) with SQL issued via
# psql, and checks the outcome through both the admin API and the database.
#
# NOTE: inside the indented string below, avoid two consecutive single quotes
# and dollar-brace sequences; Nix would treat them as string terminator and
# interpolation respectively.
{
  pkgs,
  self,
}:
pkgs.testers.nixosTest {
  name = "fc-machine-health";

  nodes.machine = {
    imports = [
      self.nixosModules.fc-ci
      ../vm-common.nix
    ];
    _module.args.self = self;
  };

  testScript = ''
    import hashlib
    import json

    BUILDERS_URL = "http://127.0.0.1:3000/api/v1/admin/builders"
    PSQL = "sudo -u fc psql -U fc -d fc"

    # Simulated record_failure statement (same as repo code, per the original
    # test). The right-hand sides see the row's old values, so the backoff is
    # 60s * 3^(new_failures - 1), with the failure count capped at 4.
    RECORD_FAILURE_SQL = (
        "UPDATE remote_builders SET "
        "consecutive_failures = LEAST(consecutive_failures + 1, 4), "
        "last_failure = NOW(), "
        "disabled_until = NOW() + make_interval(secs => 60.0 * power(3, LEAST(consecutive_failures + 1, 4) - 1))"
    )

    # Availability filter used by the find_for_system subtests: enabled,
    # supports x86_64-linux, and not currently inside a backoff window.
    AVAILABLE_BUILDERS_SQL = (
        "SELECT count(*) FROM remote_builders "
        "WHERE enabled = true "
        "AND 'x86_64-linux' = ANY(systems) "
        "AND (disabled_until IS NULL OR disabled_until < NOW())"
    )

    api_token = "fc_testkey123"
    api_hash = hashlib.sha256(api_token.encode()).hexdigest()
    auth_header = f"-H 'Authorization: Bearer {api_token}'"


    def run_sql(sql: str) -> None:
        """Execute one SQL statement as the fc user; the test fails if psql fails.

        The statement is wrapped in shell double quotes, so it must not contain
        double quotes, backticks or dollar signs.
        """
        machine.succeed(f"{PSQL} -c \"{sql}\"")


    def query_scalar(sql: str) -> str:
        """Run a query with -tA and return its whitespace-stripped output."""
        return machine.succeed(f"{PSQL} -tA -c \"{sql}\"").strip()


    def get_builder(bid):
        """Fetch a single builder from the admin API and decode the JSON body."""
        return json.loads(machine.succeed(f"curl -sf {BUILDERS_URL}/{bid} {auth_header}"))


    def record_failure(bid) -> None:
        """Apply the simulated record_failure statement to one builder row."""
        run_sql(f"{RECORD_FAILURE_SQL} WHERE id = '{bid}'")


    def backoff_seconds(bid) -> int:
        """Return disabled_until - last_failure for one builder, in whole seconds."""
        return int(query_scalar(
            "SELECT EXTRACT(EPOCH FROM (disabled_until - last_failure))::int "
            f"FROM remote_builders WHERE id = '{bid}'"
        ))


    machine.start()
    machine.wait_for_unit("postgresql.service")
    machine.wait_until_succeeds("sudo -u fc psql -U fc -d fc -c 'SELECT 1'", timeout=30)
    machine.wait_for_unit("fc-server.service")
    machine.wait_until_succeeds("curl -sf http://127.0.0.1:3000/health", timeout=30)

    # Register an admin API key directly in the database; only its SHA-256
    # hex digest is stored.
    run_sql(f"INSERT INTO api_keys (name, key_hash, role) VALUES ('test', '{api_hash}', 'admin')")

    # Create a builder via API
    builder = json.loads(
        machine.succeed(
            f"curl -sf -X POST {BUILDERS_URL} "
            f"{auth_header} "
            "-H 'Content-Type: application/json' "
            "-d '{\"name\": \"test-builder\", \"ssh_uri\": \"ssh://builder@host\", \"systems\": [\"x86_64-linux\"]}'"
        )
    )
    builder_id = builder["id"]

    with subtest("New builder starts with zero failures"):
        assert builder["consecutive_failures"] == 0, \
            f"Expected 0 failures, got {builder['consecutive_failures']}"
        assert builder["disabled_until"] is None, \
            f"Expected disabled_until=null, got {builder['disabled_until']}"
        assert builder["last_failure"] is None, \
            f"Expected last_failure=null, got {builder['last_failure']}"

    with subtest("Recording failure increments consecutive_failures"):
        record_failure(builder_id)
        b = get_builder(builder_id)
        assert b["consecutive_failures"] == 1, \
            f"Expected 1 failure, got {b['consecutive_failures']}"
        assert b["disabled_until"] is not None, \
            "Expected disabled_until to be set"
        assert b["last_failure"] is not None, \
            "Expected last_failure to be set"

    with subtest("Failures cap at 4"):
        # Start well above the cap so a single recorded failure must clamp.
        run_sql(f"UPDATE remote_builders SET consecutive_failures = 10 WHERE id = '{builder_id}'")
        record_failure(builder_id)
        failures = query_scalar(
            f"SELECT consecutive_failures FROM remote_builders WHERE id = '{builder_id}'"
        )
        assert failures == "4", f"Expected failures capped at 4, got {failures}"

    with subtest("Disabled builder excluded from find_for_system"):
        # Set disabled_until far in the future
        run_sql(
            "UPDATE remote_builders SET disabled_until = NOW() + interval '1 hour' "
            f"WHERE id = '{builder_id}'"
        )
        count = query_scalar(AVAILABLE_BUILDERS_SQL)
        assert count == "0", \
            f"Expected disabled builder excluded, got count={count}"

    with subtest("Non-disabled builder included in find_for_system"):
        # Clear disabled_until
        run_sql(f"UPDATE remote_builders SET disabled_until = NULL WHERE id = '{builder_id}'")
        count = query_scalar(AVAILABLE_BUILDERS_SQL)
        assert count == "1", \
            f"Expected non-disabled builder included, got count={count}"

    with subtest("Recording success resets health state"):
        # First set some failures
        run_sql(
            "UPDATE remote_builders SET "
            "consecutive_failures = 3, "
            "disabled_until = NOW() + interval '1 hour', "
            "last_failure = NOW() "
            f"WHERE id = '{builder_id}'"
        )
        # Simulate record_success (same as repo code)
        run_sql(
            "UPDATE remote_builders SET "
            "consecutive_failures = 0, "
            "disabled_until = NULL "
            f"WHERE id = '{builder_id}'"
        )
        b = get_builder(builder_id)
        assert b["consecutive_failures"] == 0, \
            f"Expected 0 failures after success, got {b['consecutive_failures']}"
        assert b["disabled_until"] is None, \
            f"Expected disabled_until=null after success, got {b['disabled_until']}"

    with subtest("Health fields visible in admin API list"):
        builders = json.loads(machine.succeed(f"curl -sf {BUILDERS_URL} {auth_header}"))
        assert len(builders) >= 1, "Expected at least one builder"
        b = builders[0]
        assert "consecutive_failures" in b, "Missing consecutive_failures in API response"
        assert "disabled_until" in b, "Missing disabled_until in API response"
        assert "last_failure" in b, "Missing last_failure in API response"

    with subtest("Exponential backoff increases with failures"):
        # Reset so the next recorded failure is the first one.
        run_sql(
            f"UPDATE remote_builders SET consecutive_failures = 0, disabled_until = NULL WHERE id = '{builder_id}'"
        )

        # Record 1st failure: expect ~60s backoff
        record_failure(builder_id)
        d1 = backoff_seconds(builder_id)
        assert 55 <= d1 <= 65, f"1st failure backoff expected ~60s, got {d1}s"

        # Record 2nd failure: expect ~180s backoff
        record_failure(builder_id)
        d2 = backoff_seconds(builder_id)
        assert 175 <= d2 <= 185, f"2nd failure backoff expected ~180s, got {d2}s"

    # Cleanup
    machine.succeed(f"curl -sf -X DELETE {BUILDERS_URL}/{builder_id} {auth_header}")
  '';
}