circus/nix/tests/machine-health.nix
NotAShelf 9dde82d46f
nix: add tests for channel tarballs and gc pinning
Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: Ifb9d95d5206b7b1cf23fa3d5aaf9d0db6a6a6964
2026-02-28 12:18:10 +03:00

204 lines
8.5 KiB
Nix

# NixOS VM test "fc-machine-health".
#
# Boots one VM with the FC CI module, registers a remote builder through the
# admin HTTP API, and then checks how the builder's health columns
# (consecutive_failures, disabled_until, last_failure) behave when they are
# updated in PostgreSQL and read back through the API.
{ pkgs, self }:
pkgs.testers.nixosTest {
  name = "fc-machine-health";

  nodes = {
    machine = {
      # Pass `self` through to the imported modules as a module argument.
      _module.args.self = self;

      imports = [
        self.nixosModules.fc-ci
        # NOTE(review): presumably the VM settings shared by the sibling
        # tests; its contents are not visible from this file.
        ../vm-common.nix
      ];
    };
  };

  # Python test-driver script; `machine` refers to the node declared above.
  testScript = ''
import hashlib
import json

# Boot the VM, then wait for each service unit and for a probe command that
# shows the service is actually answering (database first, then the server).
machine.start()
for unit, probe in (
    ("postgresql.service", "sudo -u fc psql -U fc -d fc -c 'SELECT 1'"),
    ("fc-server.service", "curl -sf http://127.0.0.1:3000/health"),
):
    machine.wait_for_unit(unit)
    machine.wait_until_succeeds(probe, timeout=30)

# Seed an admin API key straight into the database. The row carries the
# SHA-256 hex digest of the token; the plain token is only used in the header.
token = "fc_testkey123"
token_digest = hashlib.sha256(token.encode()).hexdigest()
insert_key_sql = (
    "INSERT INTO api_keys (name, key_hash, role) "
    f"VALUES ('test', '{token_digest}', 'admin')"
)
machine.succeed(f"sudo -u fc psql -U fc -d fc -c \"{insert_key_sql}\"")

# curl argument reused by the authenticated requests further down.
auth_header = f"-H 'Authorization: Bearer {token}'"

# Create a builder via API and keep the parsed response; the subtests below
# read `builder` and `builder_id`.
create_builder_cmd = " ".join([
    "curl -sf -X POST http://127.0.0.1:3000/api/v1/admin/builders",
    auth_header,
    "-H 'Content-Type: application/json'",
    "-d '{\"name\": \"test-builder\", \"ssh_uri\": \"ssh://builder@host\", \"systems\": [\"x86_64-linux\"]}'",
])
builder = json.loads(machine.succeed(create_builder_cmd))
builder_id = builder["id"]
with subtest("New builder starts with zero failures"):
    # `builder` is the parsed response of the create request: a fresh builder
    # must report no failures and no backoff/failure timestamps.
    failures = builder["consecutive_failures"]
    assert failures == 0, f"Expected 0 failures, got {failures}"
    for field in ("disabled_until", "last_failure"):
        value = builder[field]
        assert value is None, f"Expected {field}=null, got {value}"
with subtest("Recording failure increments consecutive_failures"):
    # Register one failure by updating the row directly: bump the counter
    # (capped at 4), stamp last_failure and disable the builder for 60 seconds.
    bump_sql = (
        "UPDATE remote_builders SET "
        "consecutive_failures = LEAST(consecutive_failures + 1, 4), "
        "last_failure = NOW(), "
        "disabled_until = NOW() + interval '60 seconds' "
        f"WHERE id = '{builder_id}'"
    )
    machine.succeed(f"sudo -u fc psql -U fc -d fc -c \"{bump_sql}\"")

    # Read the builder back through the API.
    # NOTE(review): this GET is sent without the Authorization header, unlike
    # the create/list/delete requests; confirm that is intended.
    after = json.loads(
        machine.succeed(f"curl -sf http://127.0.0.1:3000/api/v1/admin/builders/{builder_id}")
    )
    seen = after["consecutive_failures"]
    assert seen == 1, f"Expected 1 failure, got {seen}"
    assert after["disabled_until"] is not None, "Expected disabled_until to be set"
    assert after["last_failure"] is not None, "Expected last_failure to be set"
with subtest("Failures cap at 4"):
    psql = "sudo -u fc psql -U fc -d fc"

    # Start from a counter that is already above the cap.
    machine.succeed(
        f"{psql} -c \"UPDATE remote_builders SET consecutive_failures = 10 WHERE id = '{builder_id}'\""
    )

    # Simulated record_failure statement.
    # NOTE(review): meant to mirror the query used by the server code; that
    # code is not visible from this file, so keep the two in sync by hand.
    record_failure_sql = (
        "UPDATE remote_builders SET "
        "consecutive_failures = LEAST(consecutive_failures + 1, 4), "
        "last_failure = NOW(), "
        "disabled_until = NOW() + make_interval(secs => 60.0 * power(3, LEAST(consecutive_failures + 1, 4) - 1)) "
        f"WHERE id = '{builder_id}'"
    )
    machine.succeed(f"{psql} -c \"{record_failure_sql}\"")

    # -tA prints the bare value, so the output can be compared as a string.
    stored = machine.succeed(
        f"{psql} -tA -c \"SELECT consecutive_failures FROM remote_builders WHERE id = '{builder_id}'\""
    ).strip()
    assert stored == "4", f"Expected failures capped at 4, got {stored}"
with subtest("Disabled builder excluded from find_for_system"):
    # Push disabled_until one hour into the future.
    disable_sql = (
        "UPDATE remote_builders SET disabled_until = NOW() + interval '1 hour' "
        f"WHERE id = '{builder_id}'"
    )
    machine.succeed(f"sudo -u fc psql -U fc -d fc -c \"{disable_sql}\"")

    # Count enabled x86_64-linux builders whose backoff window is unset or
    # already over; the builder disabled above must not be among them.
    # NOTE(review): presumably the same filter find_for_system applies; that
    # query lives outside this file.
    available_sql = (
        "SELECT count(*) FROM remote_builders "
        "WHERE enabled = true "
        "AND 'x86_64-linux' = ANY(systems) "
        "AND (disabled_until IS NULL OR disabled_until < NOW())"
    )
    available = machine.succeed(
        f"sudo -u fc psql -U fc -d fc -tA -c \"{available_sql}\""
    ).strip()
    assert available == "0", f"Expected disabled builder excluded, got count={available}"
with subtest("Non-disabled builder included in find_for_system"):
    # Clear disabled_until so the builder is no longer backed off.
    machine.succeed(
        f"sudo -u fc psql -U fc -d fc -c \"UPDATE remote_builders SET disabled_until = NULL WHERE id = '{builder_id}'\""
    )

    # Count enabled x86_64-linux builders that are not inside a backoff
    # window; with disabled_until cleared the test builder must be counted.
    available_sql = (
        "SELECT count(*) FROM remote_builders "
        "WHERE enabled = true "
        "AND 'x86_64-linux' = ANY(systems) "
        "AND (disabled_until IS NULL OR disabled_until < NOW())"
    )
    available = machine.succeed(
        f"sudo -u fc psql -U fc -d fc -tA -c \"{available_sql}\""
    ).strip()
    assert available == "1", f"Expected non-disabled builder included, got count={available}"
with subtest("Recording success resets health state"):
    psql = "sudo -u fc psql -U fc -d fc"

    # Put the builder into a failed, backed-off state first.
    mark_failed_sql = (
        "UPDATE remote_builders SET "
        "consecutive_failures = 3, "
        "disabled_until = NOW() + interval '1 hour', "
        "last_failure = NOW() "
        f"WHERE id = '{builder_id}'"
    )
    machine.succeed(f"{psql} -c \"{mark_failed_sql}\"")

    # Simulated record_success statement: zero the counter and lift the
    # backoff (last_failure is left untouched).
    # NOTE(review): meant to mirror the server's query, which is not visible
    # from this file.
    record_success_sql = (
        "UPDATE remote_builders SET "
        "consecutive_failures = 0, "
        "disabled_until = NULL "
        f"WHERE id = '{builder_id}'"
    )
    machine.succeed(f"{psql} -c \"{record_success_sql}\"")

    # The API must now report the reset state.
    after = json.loads(
        machine.succeed(f"curl -sf http://127.0.0.1:3000/api/v1/admin/builders/{builder_id}")
    )
    failures = after["consecutive_failures"]
    assert failures == 0, f"Expected 0 failures after success, got {failures}"
    backoff_until = after["disabled_until"]
    assert backoff_until is None, \
        f"Expected disabled_until=null after success, got {backoff_until}"
with subtest("Health fields visible in admin API list"):
    # Fetch the builder list with the admin token and inspect its first entry.
    listing = json.loads(
        machine.succeed(f"curl -sf http://127.0.0.1:3000/api/v1/admin/builders {auth_header}")
    )
    assert len(listing) >= 1, "Expected at least one builder"
    first = listing[0]
    # Only the presence of the keys is checked here, not their values.
    for field in ("consecutive_failures", "disabled_until", "last_failure"):
        assert field in first, f"Missing {field} in API response"
with subtest("Exponential backoff increases with failures"):
    psql = "sudo -u fc psql -U fc -d fc"

    # Reset the builder to a clean state before recording failures.
    machine.succeed(
        f"{psql} -c \"UPDATE remote_builders SET consecutive_failures = 0, disabled_until = NULL WHERE id = '{builder_id}'\""
    )

    # One simulated failure: bump the counter (capped at 4) and disable the
    # builder for 60s * 3^(new counter - 1).
    record_failure_sql = (
        "UPDATE remote_builders SET "
        "consecutive_failures = LEAST(consecutive_failures + 1, 4), "
        "last_failure = NOW(), "
        "disabled_until = NOW() + make_interval(secs => 60.0 * power(3, LEAST(consecutive_failures + 1, 4) - 1)) "
        f"WHERE id = '{builder_id}'"
    )
    # Backoff length in whole seconds, measured from last_failure.
    backoff_sql = (
        "SELECT EXTRACT(EPOCH FROM (disabled_until - last_failure))::int "
        f"FROM remote_builders WHERE id = '{builder_id}'"
    )

    # 1st failure: expect ~60s backoff; 2nd failure: expect ~180s backoff.
    # A 5 second tolerance is allowed on either side.
    for ordinal, expected in (("1st", 60), ("2nd", 180)):
        machine.succeed(f"{psql} -c \"{record_failure_sql}\"")
        backoff = int(machine.succeed(f"{psql} -tA -c \"{backoff_sql}\"").strip())
        assert expected - 5 <= backoff <= expected + 5, \
            f"{ordinal} failure backoff expected ~{expected}s, got {backoff}s"
# Cleanup: delete the builder created for this test through the admin API.
builder_url = f"http://127.0.0.1:3000/api/v1/admin/builders/{builder_id}"
machine.succeed(f"curl -sf -X DELETE {builder_url} {auth_header}")
'';
}