fc-evaluator: allow fail-fast behaviour

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I1da41766f1c499347279c41f2316f4376a6a6964
This commit is contained in:
raf 2026-02-16 21:05:12 +03:00
commit 235c9834b7
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
5 changed files with 30 additions and 134 deletions

View file

@ -62,6 +62,10 @@ pub struct EvaluatorConfig {
pub work_dir: PathBuf,
pub restrict_eval: bool,
pub allow_ifd: bool,
/// Whether to abort on the first evaluation cycle error instead of logging
/// and retrying.
pub strict_errors: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -70,6 +74,11 @@ pub struct QueueRunnerConfig {
pub poll_interval: u64,
pub build_timeout: u64,
pub work_dir: PathBuf,
/// When true, abort on the first runner loop error instead of logging and
/// retrying.
#[serde(default)]
pub strict_errors: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -504,6 +513,7 @@ impl Default for EvaluatorConfig {
work_dir: PathBuf::from("/tmp/fc-evaluator"),
restrict_eval: true,
allow_ifd: false,
strict_errors: false,
}
}
}
@ -515,6 +525,7 @@ impl Default for QueueRunnerConfig {
poll_interval: 5,
build_timeout: 3600,
work_dir: PathBuf::from("/tmp/fc-queue-runner"),
strict_errors: false,
}
}
}

View file

@ -29,8 +29,13 @@ pub async fn run(
let nix_timeout = Duration::from_secs(config.nix_timeout);
let git_timeout = Duration::from_secs(config.git_timeout);
let strict = config.strict_errors;
loop {
if let Err(e) = run_cycle(&pool, &config, nix_timeout, git_timeout).await {
if strict {
return Err(e);
}
tracing::error!("Evaluation cycle failed: {e}");
}
// Wake on NOTIFY or fall back to regular poll interval
@ -180,36 +185,12 @@ async fn evaluate_jobset(
"Inputs unchanged (hash: {}), skipping evaluation",
&inputs_hash[..16],
);
// Create evaluation record even when skipped so system tracks this check
// Handle duplicate key conflict gracefully (another evaluator may have
// created it) - fall through to process existing evaluation instead of
// skipping
if let Err(e) = repo::evaluations::create(pool, CreateEvaluation {
jobset_id: jobset.id,
commit_hash: commit_hash.clone(),
pr_number: None,
pr_head_branch: None,
pr_base_branch: None,
pr_action: None,
})
.await
{
if !matches!(e, CiError::Conflict(_)) {
return Err(e.into());
}
tracing::info!(
jobset = %jobset.name,
commit = %commit_hash,
"Evaluation already exists (concurrent creation in inputs_hash path), will process"
);
} else {
// Successfully created new evaluation, can skip
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
}
}
// Also skip if commit hasn't changed (backward compat)
// Also skip if commit hasn't changed and inputs_hash matches (backward
// compat for evaluations created before inputs_hash was indexed)
if let Some(latest) = repo::evaluations::get_latest(pool, jobset.id).await?
&& latest.commit_hash == commit_hash
&& latest.inputs_hash.as_deref() == Some(&inputs_hash)
@ -220,111 +201,8 @@ async fn evaluate_jobset(
"Inputs unchanged (hash: {}), skipping evaluation",
&inputs_hash[..16],
);
// Create evaluation record even when skipped so system tracks this check
// Handle duplicate key conflict gracefully (another evaluator may have
// created it) - fall through to process existing evaluation instead of
// skipping
if let Err(e) = repo::evaluations::create(pool, CreateEvaluation {
jobset_id: jobset.id,
commit_hash: commit_hash.clone(),
pr_number: None,
pr_head_branch: None,
pr_base_branch: None,
pr_action: None,
})
.await
{
if !matches!(e, CiError::Conflict(_)) {
return Err(e.into());
}
tracing::info!(
jobset = %jobset.name,
commit = %commit_hash,
"Evaluation already exists (concurrent creation in commit path), will process"
);
let existing = repo::evaluations::get_by_jobset_and_commit(
pool,
jobset.id,
&commit_hash,
)
.await?
.ok_or_else(|| {
anyhow::anyhow!(
"Evaluation conflict but not found: {}/{}",
jobset.id,
commit_hash
)
})?;
if existing.status == EvaluationStatus::Completed {
// Check if we need to re-evaluate due to no builds
let builds =
repo::builds::list_for_evaluation(pool, existing.id).await?;
if builds.is_empty() {
info!(
"Evaluation completed with 0 builds, re-running nix evaluation \
jobset={} commit={}",
jobset.name, commit_hash
);
// Update existing evaluation status to Running
repo::evaluations::update_status(
pool,
existing.id,
EvaluationStatus::Running,
None,
)
.await?;
// Use existing evaluation instead of creating new one
let eval = existing;
// Run nix evaluation and create builds from the result
let eval_result = crate::nix::evaluate(
&repo_path,
&jobset.nix_expression,
jobset.flake_mode,
nix_timeout,
config,
&inputs,
)
.await?;
create_builds_from_eval(pool, eval.id, &eval_result).await?;
repo::evaluations::update_status(
pool,
eval.id,
EvaluationStatus::Completed,
None,
)
.await?;
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
} else {
info!(
"Evaluation already completed with {} builds, skipping nix \
evaluation jobset={} commit={}",
builds.len(),
jobset.name,
commit_hash
);
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
}
}
// Existing evaluation is pending or running, update status and continue
repo::evaluations::update_status(
pool,
existing.id,
EvaluationStatus::Running,
None,
)
.await?;
} else {
// Successfully created new evaluation, can skip
repo::jobsets::update_last_checked(pool, jobset.id).await?;
return Ok(());
}
}
tracing::info!(

View file

@ -35,6 +35,7 @@ async fn main() -> anyhow::Result<()> {
let workers = cli.workers.unwrap_or(qr_config.workers);
let poll_interval = Duration::from_secs(qr_config.poll_interval);
let build_timeout = Duration::from_secs(qr_config.build_timeout);
let strict_errors = qr_config.strict_errors;
let work_dir = qr_config.work_dir;
// Ensure the work directory exists
@ -76,7 +77,7 @@ async fn main() -> anyhow::Result<()> {
);
tokio::select! {
result = fc_queue_runner::runner_loop::run(db.pool().clone(), worker_pool, poll_interval, wakeup) => {
result = fc_queue_runner::runner_loop::run(db.pool().clone(), worker_pool, poll_interval, wakeup, strict_errors) => {
if let Err(e) = result {
tracing::error!("Runner loop failed: {e}");
}

View file

@ -14,6 +14,7 @@ pub async fn run(
worker_pool: Arc<WorkerPool>,
poll_interval: Duration,
wakeup: Arc<Notify>,
strict_errors: bool,
) -> anyhow::Result<()> {
// Reset orphaned builds from previous crashes (older than 5 minutes)
match repo::builds::reset_orphaned(&pool, 300).await {
@ -184,6 +185,9 @@ pub async fn run(
}
},
Err(e) => {
if strict_errors {
return Err(anyhow::anyhow!("Failed to fetch pending builds: {e}"));
}
tracing::error!("Failed to fetch pending builds: {e}");
},
}

View file

@ -82,11 +82,13 @@ in {
poll_interval = 5;
work_dir = "/var/lib/fc/evaluator";
nix_timeout = 60;
strict_errors = true;
};
queue_runner = {
poll_interval = 3;
work_dir = "/var/lib/fc/queue-runner";
strict_errors = true;
};
};