fc-common: add GC pinning and machine health infrastructure

Migration 017 adds `builds.keep`, `jobsets.keep_nr`, and health tracking
columns to `remote_builders`. Repo layer implements `set_keep`,
`list_pinned_ids`, `record_failure` with exponential backoff,
`record_success`, and `find_for_system` filtering of disabled builders.
GC root cleanup now skips pinned builds.

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: Ibba121de3dc42f71204e3a8f5776aa8b6a6a6964
This commit is contained in:
raf 2026-02-17 00:02:30 +03:00
commit 5b472a2f57
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
16 changed files with 173 additions and 23 deletions

View file

@@ -374,6 +374,34 @@ pub async fn get_completed_by_drv_paths(
)
}
/// Return the set of build IDs that have `keep = true` (GC-pinned).
///
/// GC root cleanup consults this set so that pinned builds are skipped
/// when stale roots are removed.
pub async fn list_pinned_ids(
    pool: &PgPool,
) -> Result<std::collections::HashSet<Uuid>> {
    // Single-column result rows decode as one-element tuples.
    let pinned: Vec<(Uuid,)> =
        sqlx::query_as("SELECT id FROM builds WHERE keep = true")
            .fetch_all(pool)
            .await
            .map_err(CiError::Database)?;

    let mut ids = std::collections::HashSet::with_capacity(pinned.len());
    for (id,) in pinned {
        ids.insert(id);
    }
    Ok(ids)
}
/// Set the `keep` (GC pin) flag on a build.
///
/// Returns the updated row, or `CiError::NotFound` when no build with
/// the given `id` exists.
pub async fn set_keep(
    pool: &PgPool,
    id: Uuid,
    keep: bool,
) -> Result<Build> {
    let updated = sqlx::query_as::<_, Build>(
        "UPDATE builds SET keep = $1 WHERE id = $2 RETURNING *",
    )
    .bind(keep)
    .bind(id)
    .fetch_optional(pool)
    .await?;

    match updated {
        Some(build) => Ok(build),
        None => Err(CiError::NotFound(format!("Build {id} not found"))),
    }
}
/// Set the `builder_id` for a build.
pub async fn set_builder(
pool: &PgPool,

View file

@@ -18,11 +18,12 @@ pub async fn create(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
let flake_mode = input.flake_mode.unwrap_or(true);
let check_interval = input.check_interval.unwrap_or(60);
let scheduling_shares = input.scheduling_shares.unwrap_or(100);
let keep_nr = input.keep_nr.unwrap_or(3);
sqlx::query_as::<_, Jobset>(
"INSERT INTO jobsets (project_id, name, nix_expression, enabled, \
flake_mode, check_interval, branch, scheduling_shares, state) VALUES \
($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING *",
flake_mode, check_interval, branch, scheduling_shares, state, keep_nr) \
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) RETURNING *",
)
.bind(input.project_id)
.bind(&input.name)
@@ -33,6 +34,7 @@ pub async fn create(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
.bind(&input.branch)
.bind(scheduling_shares)
.bind(state.as_str())
.bind(keep_nr)
.fetch_one(pool)
.await
.map_err(|e| {
@@ -106,11 +108,12 @@ pub async fn update(
let scheduling_shares = input
.scheduling_shares
.unwrap_or(existing.scheduling_shares);
let keep_nr = input.keep_nr.unwrap_or(existing.keep_nr);
sqlx::query_as::<_, Jobset>(
"UPDATE jobsets SET name = $1, nix_expression = $2, enabled = $3, \
flake_mode = $4, check_interval = $5, branch = $6, scheduling_shares = \
$7, state = $8 WHERE id = $9 RETURNING *",
$7, state = $8, keep_nr = $9 WHERE id = $10 RETURNING *",
)
.bind(&name)
.bind(&nix_expression)
@@ -120,6 +123,7 @@ pub async fn update(
.bind(&branch)
.bind(scheduling_shares)
.bind(state.as_str())
.bind(keep_nr)
.bind(id)
.fetch_one(pool)
.await
@@ -160,15 +164,17 @@ pub async fn upsert(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
let flake_mode = input.flake_mode.unwrap_or(true);
let check_interval = input.check_interval.unwrap_or(60);
let scheduling_shares = input.scheduling_shares.unwrap_or(100);
let keep_nr = input.keep_nr.unwrap_or(3);
sqlx::query_as::<_, Jobset>(
"INSERT INTO jobsets (project_id, name, nix_expression, enabled, \
flake_mode, check_interval, branch, scheduling_shares, state) VALUES \
($1, $2, $3, $4, $5, $6, $7, $8, $9) ON CONFLICT (project_id, name) DO \
UPDATE SET nix_expression = EXCLUDED.nix_expression, enabled = \
EXCLUDED.enabled, flake_mode = EXCLUDED.flake_mode, check_interval = \
EXCLUDED.check_interval, branch = EXCLUDED.branch, scheduling_shares = \
EXCLUDED.scheduling_shares, state = EXCLUDED.state RETURNING *",
flake_mode, check_interval, branch, scheduling_shares, state, keep_nr) \
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) ON CONFLICT \
(project_id, name) DO UPDATE SET nix_expression = \
EXCLUDED.nix_expression, enabled = EXCLUDED.enabled, flake_mode = \
EXCLUDED.flake_mode, check_interval = EXCLUDED.check_interval, branch = \
EXCLUDED.branch, scheduling_shares = EXCLUDED.scheduling_shares, state = \
EXCLUDED.state, keep_nr = EXCLUDED.keep_nr RETURNING *",
)
.bind(input.project_id)
.bind(&input.name)
@@ -179,6 +185,7 @@ pub async fn upsert(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
.bind(&input.branch)
.bind(scheduling_shares)
.bind(state.as_str())
.bind(keep_nr)
.fetch_one(pool)
.await
.map_err(CiError::Database)

View file

@@ -70,12 +70,14 @@ pub async fn list_enabled(pool: &PgPool) -> Result<Vec<RemoteBuilder>> {
}
/// Find a suitable builder for the given system.
/// Excludes builders that are temporarily disabled due to consecutive failures.
pub async fn find_for_system(
pool: &PgPool,
system: &str,
) -> Result<Vec<RemoteBuilder>> {
sqlx::query_as::<_, RemoteBuilder>(
"SELECT * FROM remote_builders WHERE enabled = true AND $1 = ANY(systems) \
AND (disabled_until IS NULL OR disabled_until < NOW()) \
ORDER BY speed_factor DESC",
)
.bind(system)
@@ -84,6 +86,41 @@ pub async fn find_for_system(
.map_err(CiError::Database)
}
/// Record a build failure for a remote builder.
///
/// Bumps `consecutive_failures` (capped at 4), stamps `last_failure`,
/// and pushes `disabled_until` into the future with exponential backoff
/// plus up to 30 seconds of random jitter.
/// Backoff formula (from Hydra): delta = 60 * 3^(min(failures, 4) - 1) seconds.
///
/// Returns the updated row, or `CiError::NotFound` for an unknown `id`.
pub async fn record_failure(pool: &PgPool, id: Uuid) -> Result<RemoteBuilder> {
    // Column references on the right-hand side of SET read the pre-update
    // values, so `consecutive_failures + 1` is the new failure count in
    // both the counter update and the backoff exponent.
    let updated = sqlx::query_as::<_, RemoteBuilder>(
        "UPDATE remote_builders SET \
        consecutive_failures = LEAST(consecutive_failures + 1, 4), \
        last_failure = NOW(), \
        disabled_until = NOW() + make_interval(secs => \
        60.0 * power(3, LEAST(consecutive_failures + 1, 4) - 1) + (random() * 30)::int \
        ) \
        WHERE id = $1 RETURNING *",
    )
    .bind(id)
    .fetch_optional(pool)
    .await?;

    updated.ok_or_else(|| {
        CiError::NotFound(format!("Remote builder {id} not found"))
    })
}
/// Record a build success for a remote builder.
///
/// Resets `consecutive_failures` to zero and clears `disabled_until`,
/// making the builder immediately eligible for scheduling again.
///
/// Returns the updated row, or `CiError::NotFound` for an unknown `id`.
pub async fn record_success(pool: &PgPool, id: Uuid) -> Result<RemoteBuilder> {
    let updated = sqlx::query_as::<_, RemoteBuilder>(
        "UPDATE remote_builders SET \
        consecutive_failures = 0, \
        disabled_until = NULL \
        WHERE id = $1 RETURNING *",
    )
    .bind(id)
    .fetch_optional(pool)
    .await?;

    match updated {
        Some(builder) => Ok(builder),
        None => {
            Err(CiError::NotFound(format!("Remote builder {id} not found")))
        }
    }
}
pub async fn update(
pool: &PgPool,
id: Uuid,