circus/crates/evaluator/src/eval_loop.rs


use std::{collections::HashMap, time::Duration};
use anyhow::Context;
use chrono::Utc;
use fc_common::{
config::EvaluatorConfig,
error::{CiError, check_disk_space},
models::{
CreateBuild,
CreateEvaluation,
EvaluationStatus,
JobsetInput,
JobsetState,
},
repo,
};
use futures::stream::{self, StreamExt};
use sqlx::PgPool;
use tracing::info;
use uuid::Uuid;
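/// Entry point for the evaluator: repeatedly runs an evaluation cycle and then
/// sleeps for the configured poll interval. A failed cycle is logged and does
/// not terminate the loop.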
pub async fn run(pool: PgPool, config: EvaluatorConfig) -> anyhow::Result<()> {
let poll_interval = Duration::from_secs(config.poll_interval);
let nix_timeout = Duration::from_secs(config.nix_timeout);
let git_timeout = Duration::from_secs(config.git_timeout);
loop {
if let Err(e) = run_cycle(&pool, &config, nix_timeout, git_timeout).await {
tracing::error!("Evaluation cycle failed: {e}");
}
tokio::time::sleep(poll_interval).await;
}
}
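/// A single evaluation cycle: list active jobsets, keep those whose
/// `check_interval` has elapsed since `last_checked_at` (or that were never
/// checked), and evaluate them concurrently, bounded by `max_concurrent_evals`.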
async fn run_cycle(
pool: &PgPool,
config: &EvaluatorConfig,
nix_timeout: Duration,
git_timeout: Duration,
) -> anyhow::Result<()> {
let active = repo::jobsets::list_active(pool).await?;
// Filter to jobsets that are due for evaluation based on their
// check_interval and last_checked_at
let now = Utc::now();
let ready: Vec<_> = active
.into_iter()
.filter(|js| {
match js.last_checked_at {
Some(last) => {
let elapsed = (now - last).num_seconds();
elapsed >= i64::from(js.check_interval)
},
None => true, // Never checked, evaluate now
}
})
.collect();
tracing::info!("Found {} jobsets due for evaluation", ready.len());
let max_concurrent = config.max_concurrent_evals;
stream::iter(ready)
.for_each_concurrent(max_concurrent, |jobset| {
async move {
if let Err(e) =
evaluate_jobset(pool, &jobset, config, nix_timeout, git_timeout).await
{
tracing::error!(
jobset_id = %jobset.id,
jobset_name = %jobset.name,
"Failed to evaluate jobset: {e}"
);
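// Heuristic: match common out-of-space error strings. The Nix store keeps its
// database in SQLite, so "sqlite" / "disk full" errors usually point at a full
// disk as well.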
let msg = e.to_string().to_lowercase();
if msg.contains("no space left on device")
|| msg.contains("disk full")
|| msg.contains("enospc")
|| msg.contains("cannot create")
|| msg.contains("sqlite")
{
tracing::error!(
"DISK SPACE ISSUE DETECTED: Evaluation failed due to disk space \
problems. Please free up space on the server:\n- Run \
`nix-collect-garbage -d` to clean the Nix store\n- Clear \
/tmp/fc-evaluator directory\n- Check build logs directory if \
configured"
);
}
}
}
})
.await;
Ok(())
}
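/// Evaluate a single jobset: clone or fetch its repository, skip the Nix
/// evaluation when the commit and inputs hash are unchanged (eval caching),
/// otherwise run the evaluation, create build records from the result, update
/// `last_checked_at`, and disable one-shot jobsets once they have run.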
async fn evaluate_jobset(
pool: &PgPool,
jobset: &fc_common::models::ActiveJobset,
config: &EvaluatorConfig,
nix_timeout: Duration,
git_timeout: Duration,
) -> anyhow::Result<()> {
let url = jobset.repository_url.clone();
let work_dir = config.work_dir.clone();
let project_name = jobset.project_name.clone();
let branch = jobset.branch.clone();
tracing::info!(
jobset = %jobset.name,
project = %project_name,
"Starting evaluation cycle"
);
match check_disk_space(&work_dir) {
Ok(info) => {
if info.is_critical() {
tracing::error!(
jobset = %jobset.name,
"CRITICAL: Less than 1GB disk space available. {}",
info.summary()
);
} else if info.is_low() {
tracing::warn!(
jobset = %jobset.name,
"LOW: Less than 5GB disk space available. {}",
info.summary()
);
}
},
Err(e) => {
tracing::warn!(
jobset = %jobset.name,
"Disk space check failed: {}. Proceeding anyway...",
e
);
},
}
// Clone/fetch in a blocking task (git2 is sync) with timeout
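// The `???` below unwraps, in order: the timeout result, the blocking-task
// join handle, and the git operation's own result.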
let (repo_path, commit_hash) = tokio::time::timeout(
git_timeout,
tokio::task::spawn_blocking(move || {
crate::git::clone_or_fetch(
&url,
&work_dir,
&project_name,
branch.as_deref(),
)
}),
)
.await
.map_err(|_| {
anyhow::anyhow!("Git operation timed out after {git_timeout:?}")
})???;
// Query jobset inputs
let inputs = repo::jobset_inputs::list_for_jobset(pool, jobset.id)
.await
.unwrap_or_default();
// Compute inputs hash for eval caching (commit + all input values/revisions)
let inputs_hash = compute_inputs_hash(&commit_hash, &inputs);
// Check if this exact combination was already evaluated (eval caching)
if let Ok(Some(cached)) =
repo::evaluations::get_by_inputs_hash(pool, jobset.id, &inputs_hash).await
{
tracing::debug!(
jobset = %jobset.name,
commit = %commit_hash,
cached_eval = %cached.id,
"Inputs unchanged (hash: {}), skipping evaluation",
&inputs_hash[..16],
);
// Create evaluation record even when skipped so system tracks this check
// Handle duplicate key conflict gracefully (another evaluator may have
// created it) - fall through to process existing evaluation instead of
// skipping
if let Err(e) = repo::evaluations::create(pool, CreateEvaluation {
jobset_id: jobset.id,
commit_hash: commit_hash.clone(),
pr_number: None,
pr_head_branch: None,
pr_base_branch: None,
pr_action: None,
})
.await
{
if !matches!(e, CiError::Conflict(_)) {
return Err(e.into());
}
tracing::info!(
jobset = %jobset.name,
commit = %commit_hash,
"Evaluation already exists (concurrent creation in inputs_hash path), will process"
);
} else {
// Successfully created new evaluation, can skip
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
}
}
// Also skip when the latest evaluation already matches this commit and
// inputs hash (backward compat path)
if let Some(latest) = repo::evaluations::get_latest(pool, jobset.id).await?
&& latest.commit_hash == commit_hash
&& latest.inputs_hash.as_deref() == Some(&inputs_hash)
{
tracing::debug!(
jobset = %jobset.name,
commit = %commit_hash,
"Inputs unchanged (hash: {}), skipping evaluation",
&inputs_hash[..16],
);
// Create evaluation record even when skipped so system tracks this check
// Handle duplicate key conflict gracefully (another evaluator may have
// created it) - fall through to process existing evaluation instead of
// skipping
if let Err(e) = repo::evaluations::create(pool, CreateEvaluation {
jobset_id: jobset.id,
commit_hash: commit_hash.clone(),
pr_number: None,
pr_head_branch: None,
pr_base_branch: None,
pr_action: None,
})
.await
{
if !matches!(e, CiError::Conflict(_)) {
return Err(e.into());
}
tracing::info!(
jobset = %jobset.name,
commit = %commit_hash,
"Evaluation already exists (concurrent creation in commit path), will process"
);
let existing = repo::evaluations::get_by_jobset_and_commit(
pool,
jobset.id,
&commit_hash,
)
.await?
.ok_or_else(|| {
anyhow::anyhow!(
"Evaluation conflict but not found: {}/{}",
jobset.id,
commit_hash
)
})?;
if existing.status == EvaluationStatus::Completed {
// Check if we need to re-evaluate due to no builds
let builds =
repo::builds::list_for_evaluation(pool, existing.id).await?;
if builds.is_empty() {
info!(
"Evaluation completed with 0 builds, re-running nix evaluation \
jobset={} commit={}",
jobset.name, commit_hash
);
// Update existing evaluation status to Running
repo::evaluations::update_status(
pool,
existing.id,
EvaluationStatus::Running,
None,
)
.await?;
// Use existing evaluation instead of creating new one
let eval = existing;
// Run nix evaluation and create builds from the result
let eval_result = crate::nix::evaluate(
&repo_path,
&jobset.nix_expression,
jobset.flake_mode,
nix_timeout,
config,
&inputs,
)
.await?;
create_builds_from_eval(pool, eval.id, &eval_result).await?;
repo::evaluations::update_status(
pool,
eval.id,
EvaluationStatus::Completed,
None,
)
.await?;
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
} else {
info!(
"Evaluation already completed with {} builds, skipping nix \
evaluation jobset={} commit={}",
builds.len(),
jobset.name,
commit_hash
);
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
}
}
// Existing evaluation is pending or running, update status and continue
repo::evaluations::update_status(
pool,
existing.id,
EvaluationStatus::Running,
None,
)
.await?;
} else {
// Successfully created new evaluation, can skip
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
}
}
tracing::info!(
jobset = %jobset.name,
commit = %commit_hash,
"Starting evaluation"
);
// Create the evaluation record. If it already exists (race condition), fetch
// the existing one and continue: a pending evaluation is moved to Running, and
// a completed evaluation is re-run only if it produced no builds.
let eval = match repo::evaluations::create(pool, CreateEvaluation {
jobset_id: jobset.id,
commit_hash: commit_hash.clone(),
pr_number: None,
pr_head_branch: None,
pr_base_branch: None,
pr_action: None,
})
.await
{
Ok(eval) => eval,
Err(CiError::Conflict(_)) => {
tracing::info!(
jobset = %jobset.name,
commit = %commit_hash,
"Evaluation already exists (conflict), fetching existing record"
);
let existing = repo::evaluations::get_by_jobset_and_commit(
pool,
jobset.id,
&commit_hash,
)
.await?
.ok_or_else(|| {
anyhow::anyhow!(
"Evaluation conflict but not found: {}/{}",
jobset.id,
commit_hash
)
})?;
if existing.status == EvaluationStatus::Pending {
repo::evaluations::update_status(
pool,
existing.id,
EvaluationStatus::Running,
None,
)
.await?;
} else if existing.status == EvaluationStatus::Completed {
let build_count = repo::builds::count_filtered(
pool,
Some(existing.id),
None,
None,
None,
)
.await?;
if build_count > 0 {
info!(
"Evaluation already completed with {} builds, skipping nix \
evaluation jobset={} commit={}",
build_count, jobset.name, commit_hash
);
return Ok(());
} else {
info!(
"Evaluation completed but has 0 builds, re-running nix evaluation \
jobset={} commit={}",
jobset.name, commit_hash
);
}
}
existing
},
Err(e) => {
return Err(anyhow::anyhow!(e)).with_context(|| {
format!("failed to create evaluation for jobset {}", jobset.name)
});
},
};
// Record the inputs hash so later cycles can skip this commit/inputs
// combination; a failure here only loses eval caching, so warn and continue
if let Err(e) =
repo::evaluations::set_inputs_hash(pool, eval.id, &inputs_hash).await
{
tracing::warn!(eval_id = %eval.id, "Failed to set evaluation inputs hash: {e}");
}
// Check for declarative config in repo
check_declarative_config(pool, &repo_path, jobset.project_id).await;
// Run nix evaluation
match crate::nix::evaluate(
&repo_path,
&jobset.nix_expression,
jobset.flake_mode,
nix_timeout,
config,
&inputs,
)
.await
{
Ok(eval_result) => {
tracing::debug!(jobset = %jobset.name, job_count = eval_result.jobs.len(), "Nix evaluation returned");
tracing::info!(
jobset = %jobset.name,
count = eval_result.jobs.len(),
errors = eval_result.error_count,
"Evaluation discovered jobs"
);
create_builds_from_eval(pool, eval.id, &eval_result).await?;
repo::evaluations::update_status(
pool,
eval.id,
EvaluationStatus::Completed,
None,
)
.await?;
},
Err(e) => {
let msg = e.to_string();
tracing::error!(jobset = %jobset.name, "Evaluation failed: {msg}");
repo::evaluations::update_status(
pool,
eval.id,
EvaluationStatus::Failed,
Some(&msg),
)
.await?;
},
}
// Update last_checked_at timestamp for per-jobset interval tracking
if let Err(e) = repo::jobsets::update_last_checked(pool, jobset.id).await {
tracing::warn!(
jobset = %jobset.name,
"Failed to update last_checked_at: {e}"
);
}
// Mark one-shot jobsets as complete (disabled) after evaluation
if jobset.state == JobsetState::OneShot {
tracing::info!(
jobset = %jobset.name,
"One-shot evaluation complete, disabling jobset"
);
if let Err(e) = repo::jobsets::mark_one_shot_complete(pool, jobset.id).await
{
tracing::error!(
jobset = %jobset.name,
"Failed to mark one-shot complete: {e}"
);
}
}
Ok(())
}
/// Create build records from evaluation results, resolving dependencies.
async fn create_builds_from_eval(
pool: &PgPool,
eval_id: Uuid,
eval_result: &crate::nix::EvalResult,
) -> anyhow::Result<()> {
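// First pass: create one build row per job and remember its id both by
// derivation path and by job name, so the second pass can link dependencies.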
let mut drv_to_build: HashMap<String, Uuid> = HashMap::new();
let mut name_to_build: HashMap<String, Uuid> = HashMap::new();
for job in &eval_result.jobs {
let outputs_json = job
.outputs
.as_ref()
.map(|o| serde_json::to_value(o).unwrap_or_default());
let constituents_json = job
.constituents
.as_ref()
.map(|c| serde_json::to_value(c).unwrap_or_default());
let is_aggregate = job.constituents.is_some();
let build = repo::builds::create(pool, CreateBuild {
evaluation_id: eval_id,
job_name: job.name.clone(),
drv_path: job.drv_path.clone(),
system: job.system.clone(),
outputs: outputs_json,
is_aggregate: Some(is_aggregate),
constituents: constituents_json,
})
.await?;
drv_to_build.insert(job.drv_path.clone(), build.id);
name_to_build.insert(job.name.clone(), build.id);
}
// Resolve dependencies
for job in &eval_result.jobs {
let build_id = match drv_to_build.get(&job.drv_path) {
Some(id) => *id,
None => continue,
};
// Input derivation dependencies
if let Some(ref input_drvs) = job.input_drvs {
for dep_drv in input_drvs.keys() {
if let Some(&dep_build_id) = drv_to_build.get(dep_drv)
&& dep_build_id != build_id
{
if let Err(e) =
repo::build_dependencies::create(pool, build_id, dep_build_id).await
{
tracing::warn!(build_id = %build_id, dep = %dep_build_id, "Failed to create build dependency: {e}");
}
}
}
}
// Aggregate constituent dependencies
if let Some(ref constituents) = job.constituents {
for constituent_name in constituents {
if let Some(&dep_build_id) = name_to_build.get(constituent_name)
&& dep_build_id != build_id
{
if let Err(e) =
repo::build_dependencies::create(pool, build_id, dep_build_id).await
{
tracing::warn!(build_id = %build_id, dep = %dep_build_id, "Failed to create constituent dependency: {e}");
}
}
}
}
}
Ok(())
}
/// Compute a deterministic hash over the commit and all jobset inputs.
/// Used for evaluation caching — skip re-eval when inputs haven't changed.
fn compute_inputs_hash(commit_hash: &str, inputs: &[JobsetInput]) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(commit_hash.as_bytes());
// Sort inputs by name for deterministic hashing
let mut sorted_inputs: Vec<&JobsetInput> = inputs.iter().collect();
sorted_inputs.sort_by_key(|i| &i.name);
for input in sorted_inputs {
hasher.update(input.name.as_bytes());
hasher.update(input.input_type.as_bytes());
hasher.update(input.value.as_bytes());
if let Some(ref rev) = input.revision {
hasher.update(rev.as_bytes());
}
}
hex::encode(hasher.finalize())
}
/// Check for a declarative project config (`.fc.toml` or `.fc/config.toml`)
/// in the repo and upsert any jobsets it declares for the project.
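///
/// Illustrative shape of the file, inferred from the `DeclarativeConfig` and
/// `DeclarativeJobset` structs below (field names as deserialized; values are
/// examples only):
///
/// ```toml
/// [[jobsets]]
/// name = "main"
/// nix_expression = "hydraJobs"
/// flake_mode = true
/// check_interval = 300
/// enabled = true
/// ```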
async fn check_declarative_config(
pool: &PgPool,
repo_path: &std::path::Path,
project_id: Uuid,
) {
let config_path = repo_path.join(".fc.toml");
let alt_config_path = repo_path.join(".fc/config.toml");
let path = if config_path.exists() {
config_path
} else if alt_config_path.exists() {
alt_config_path
} else {
return;
};
let content = match std::fs::read_to_string(&path) {
Ok(c) => c,
Err(e) => {
tracing::warn!(
"Failed to read declarative config {}: {e}",
path.display()
);
return;
},
};
#[derive(serde::Deserialize)]
struct DeclarativeConfig {
jobsets: Option<Vec<DeclarativeJobset>>,
}
#[derive(serde::Deserialize)]
struct DeclarativeJobset {
name: String,
nix_expression: String,
flake_mode: Option<bool>,
check_interval: Option<i32>,
enabled: Option<bool>,
}
let config: DeclarativeConfig = match toml::from_str(&content) {
Ok(c) => c,
Err(e) => {
tracing::warn!("Failed to parse declarative config: {e}");
return;
},
};
if let Some(jobsets) = config.jobsets {
for js in jobsets {
let input = fc_common::models::CreateJobset {
project_id,
name: js.name,
nix_expression: js.nix_expression,
enabled: js.enabled,
flake_mode: js.flake_mode,
check_interval: js.check_interval,
branch: None,
scheduling_shares: None,
state: None,
};
if let Err(e) = repo::jobsets::upsert(pool, input).await {
tracing::warn!("Failed to upsert declarative jobset: {e}");
}
}
}
}