fc-common: add GC pinning and machine health infrastructure
Migration 017 adds `builds.keep`, `jobsets.keep_nr`, and health tracking columns to `remote_builders`. Repo layer implements `set_keep`, `list_pinned_ids`, `record_failure` with exponential backoff, `record_success`, and `find_for_system` filtering of disabled builders. GC root cleanup now skips pinned builds. Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: Ibba121de3dc42f71204e3a8f5776aa8b6a6a6964
This commit is contained in:
parent
25699e5e97
commit
5b472a2f57
16 changed files with 173 additions and 23 deletions
|
|
@ -374,6 +374,34 @@ pub async fn get_completed_by_drv_paths(
|
|||
)
|
||||
}
|
||||
|
||||
/// Return the set of build IDs that have `keep = true` (GC-pinned).
|
||||
pub async fn list_pinned_ids(
|
||||
pool: &PgPool,
|
||||
) -> Result<std::collections::HashSet<Uuid>> {
|
||||
let rows: Vec<(Uuid,)> =
|
||||
sqlx::query_as("SELECT id FROM builds WHERE keep = true")
|
||||
.fetch_all(pool)
|
||||
.await
|
||||
.map_err(CiError::Database)?;
|
||||
Ok(rows.into_iter().map(|(id,)| id).collect())
|
||||
}
|
||||
|
||||
/// Set the `keep` (GC pin) flag on a build.
|
||||
pub async fn set_keep(
|
||||
pool: &PgPool,
|
||||
id: Uuid,
|
||||
keep: bool,
|
||||
) -> Result<Build> {
|
||||
sqlx::query_as::<_, Build>(
|
||||
"UPDATE builds SET keep = $1 WHERE id = $2 RETURNING *",
|
||||
)
|
||||
.bind(keep)
|
||||
.bind(id)
|
||||
.fetch_optional(pool)
|
||||
.await?
|
||||
.ok_or_else(|| CiError::NotFound(format!("Build {id} not found")))
|
||||
}
|
||||
|
||||
/// Set the `builder_id` for a build.
|
||||
pub async fn set_builder(
|
||||
pool: &PgPool,
|
||||
|
|
|
|||
|
|
@ -18,11 +18,12 @@ pub async fn create(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
|
|||
let flake_mode = input.flake_mode.unwrap_or(true);
|
||||
let check_interval = input.check_interval.unwrap_or(60);
|
||||
let scheduling_shares = input.scheduling_shares.unwrap_or(100);
|
||||
let keep_nr = input.keep_nr.unwrap_or(3);
|
||||
|
||||
sqlx::query_as::<_, Jobset>(
|
||||
"INSERT INTO jobsets (project_id, name, nix_expression, enabled, \
|
||||
flake_mode, check_interval, branch, scheduling_shares, state) VALUES \
|
||||
($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING *",
|
||||
flake_mode, check_interval, branch, scheduling_shares, state, keep_nr) \
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) RETURNING *",
|
||||
)
|
||||
.bind(input.project_id)
|
||||
.bind(&input.name)
|
||||
|
|
@ -33,6 +34,7 @@ pub async fn create(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
|
|||
.bind(&input.branch)
|
||||
.bind(scheduling_shares)
|
||||
.bind(state.as_str())
|
||||
.bind(keep_nr)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
|
|
@ -106,11 +108,12 @@ pub async fn update(
|
|||
let scheduling_shares = input
|
||||
.scheduling_shares
|
||||
.unwrap_or(existing.scheduling_shares);
|
||||
let keep_nr = input.keep_nr.unwrap_or(existing.keep_nr);
|
||||
|
||||
sqlx::query_as::<_, Jobset>(
|
||||
"UPDATE jobsets SET name = $1, nix_expression = $2, enabled = $3, \
|
||||
flake_mode = $4, check_interval = $5, branch = $6, scheduling_shares = \
|
||||
$7, state = $8 WHERE id = $9 RETURNING *",
|
||||
$7, state = $8, keep_nr = $9 WHERE id = $10 RETURNING *",
|
||||
)
|
||||
.bind(&name)
|
||||
.bind(&nix_expression)
|
||||
|
|
@ -120,6 +123,7 @@ pub async fn update(
|
|||
.bind(&branch)
|
||||
.bind(scheduling_shares)
|
||||
.bind(state.as_str())
|
||||
.bind(keep_nr)
|
||||
.bind(id)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
|
|
@ -160,15 +164,17 @@ pub async fn upsert(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
|
|||
let flake_mode = input.flake_mode.unwrap_or(true);
|
||||
let check_interval = input.check_interval.unwrap_or(60);
|
||||
let scheduling_shares = input.scheduling_shares.unwrap_or(100);
|
||||
let keep_nr = input.keep_nr.unwrap_or(3);
|
||||
|
||||
sqlx::query_as::<_, Jobset>(
|
||||
"INSERT INTO jobsets (project_id, name, nix_expression, enabled, \
|
||||
flake_mode, check_interval, branch, scheduling_shares, state) VALUES \
|
||||
($1, $2, $3, $4, $5, $6, $7, $8, $9) ON CONFLICT (project_id, name) DO \
|
||||
UPDATE SET nix_expression = EXCLUDED.nix_expression, enabled = \
|
||||
EXCLUDED.enabled, flake_mode = EXCLUDED.flake_mode, check_interval = \
|
||||
EXCLUDED.check_interval, branch = EXCLUDED.branch, scheduling_shares = \
|
||||
EXCLUDED.scheduling_shares, state = EXCLUDED.state RETURNING *",
|
||||
flake_mode, check_interval, branch, scheduling_shares, state, keep_nr) \
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) ON CONFLICT \
|
||||
(project_id, name) DO UPDATE SET nix_expression = \
|
||||
EXCLUDED.nix_expression, enabled = EXCLUDED.enabled, flake_mode = \
|
||||
EXCLUDED.flake_mode, check_interval = EXCLUDED.check_interval, branch = \
|
||||
EXCLUDED.branch, scheduling_shares = EXCLUDED.scheduling_shares, state = \
|
||||
EXCLUDED.state, keep_nr = EXCLUDED.keep_nr RETURNING *",
|
||||
)
|
||||
.bind(input.project_id)
|
||||
.bind(&input.name)
|
||||
|
|
@ -179,6 +185,7 @@ pub async fn upsert(pool: &PgPool, input: CreateJobset) -> Result<Jobset> {
|
|||
.bind(&input.branch)
|
||||
.bind(scheduling_shares)
|
||||
.bind(state.as_str())
|
||||
.bind(keep_nr)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.map_err(CiError::Database)
|
||||
|
|
|
|||
|
|
@ -70,12 +70,14 @@ pub async fn list_enabled(pool: &PgPool) -> Result<Vec<RemoteBuilder>> {
|
|||
}
|
||||
|
||||
/// Find a suitable builder for the given system.
|
||||
/// Excludes builders that are temporarily disabled due to consecutive failures.
|
||||
pub async fn find_for_system(
|
||||
pool: &PgPool,
|
||||
system: &str,
|
||||
) -> Result<Vec<RemoteBuilder>> {
|
||||
sqlx::query_as::<_, RemoteBuilder>(
|
||||
"SELECT * FROM remote_builders WHERE enabled = true AND $1 = ANY(systems) \
|
||||
AND (disabled_until IS NULL OR disabled_until < NOW()) \
|
||||
ORDER BY speed_factor DESC",
|
||||
)
|
||||
.bind(system)
|
||||
|
|
@ -84,6 +86,41 @@ pub async fn find_for_system(
|
|||
.map_err(CiError::Database)
|
||||
}
|
||||
|
||||
/// Record a build failure for a remote builder.
/// Increments consecutive_failures (capped at 4), sets last_failure,
/// and computes disabled_until with exponential backoff.
/// Backoff formula (from Hydra): delta = 60 * 3^(min(failures, 4) - 1) seconds,
/// plus up to 30 seconds of random jitter (the `(random() * 30)::int` term)
/// so builders that failed together do not all re-enable at the same instant.
///
/// Returns the updated row, or `CiError::NotFound` if no builder with `id` exists.
pub async fn record_failure(pool: &PgPool, id: Uuid) -> Result<RemoteBuilder> {
    // NOTE: in Postgres, every SET expression evaluates against the OLD row,
    // so `consecutive_failures + 1` inside the backoff term refers to the same
    // new value that the first SET clause assigns — the cap and the exponent
    // stay in sync within this single UPDATE.
    sqlx::query_as::<_, RemoteBuilder>(
        "UPDATE remote_builders SET \
         consecutive_failures = LEAST(consecutive_failures + 1, 4), \
         last_failure = NOW(), \
         disabled_until = NOW() + make_interval(secs => \
         60.0 * power(3, LEAST(consecutive_failures + 1, 4) - 1) + (random() * 30)::int \
         ) \
         WHERE id = $1 RETURNING *",
    )
    .bind(id)
    .fetch_optional(pool)
    .await?
    .ok_or_else(|| CiError::NotFound(format!("Remote builder {id} not found")))
}
|
||||
|
||||
/// Record a build success for a remote builder.
|
||||
/// Resets consecutive_failures and clears disabled_until.
|
||||
pub async fn record_success(pool: &PgPool, id: Uuid) -> Result<RemoteBuilder> {
|
||||
sqlx::query_as::<_, RemoteBuilder>(
|
||||
"UPDATE remote_builders SET \
|
||||
consecutive_failures = 0, \
|
||||
disabled_until = NULL \
|
||||
WHERE id = $1 RETURNING *",
|
||||
)
|
||||
.bind(id)
|
||||
.fetch_optional(pool)
|
||||
.await?
|
||||
.ok_or_else(|| CiError::NotFound(format!("Remote builder {id} not found")))
|
||||
}
|
||||
|
||||
pub async fn update(
|
||||
pool: &PgPool,
|
||||
id: Uuid,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue