fc-queue-runner: implement per-build cancellation via CancellationToken
Adds an `ActiveBuilds` registry (a DashMap of `<Uuid, CancellationToken>`) to `WorkerPool` and makes `dispatch()` create a per-build token and race `run_build` against it via Tokio's `select!`. The `cancel_checker_loop` then polls the DB every N seconds (currently 2) for builds that were cancelled while running, and triggers their tokens. The existing `kill_on_drop(true)` on `nix build` processes handles subprocess cleanup when the future is dropped. Thank you past me for your insight. Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: Ic8af58e92972c7d5d104d9c717e9217d6a6a6964
This commit is contained in:
parent
d401177902
commit
f8586a7f3c
4 changed files with 82 additions and 21 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -874,6 +874,7 @@ dependencies = [
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"config",
|
"config",
|
||||||
|
"dashmap",
|
||||||
"fc-common",
|
"fc-common",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ anyhow.workspace = true
|
||||||
chrono.workspace = true
|
chrono.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
config.workspace = true
|
config.workspace = true
|
||||||
|
dashmap.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
sqlx.workspace = true
|
sqlx.workspace = true
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,9 @@ use fc_common::{
|
||||||
config::{Config, GcConfig},
|
config::{Config, GcConfig},
|
||||||
database::Database,
|
database::Database,
|
||||||
gc_roots,
|
gc_roots,
|
||||||
|
repo,
|
||||||
};
|
};
|
||||||
use fc_queue_runner::worker::WorkerPool;
|
use fc_queue_runner::worker::{ActiveBuilds, WorkerPool};
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
#[command(name = "fc-queue-runner")]
|
#[command(name = "fc-queue-runner")]
|
||||||
|
|
@ -78,6 +79,8 @@ async fn main() -> anyhow::Result<()> {
|
||||||
wakeup.clone(),
|
wakeup.clone(),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let active_builds = worker_pool.active_builds().clone();
|
||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
result = fc_queue_runner::runner_loop::run(db.pool().clone(), worker_pool, poll_interval, wakeup, strict_errors, failed_paths_cache) => {
|
result = fc_queue_runner::runner_loop::run(db.pool().clone(), worker_pool, poll_interval, wakeup, strict_errors, failed_paths_cache) => {
|
||||||
if let Err(e) = result {
|
if let Err(e) = result {
|
||||||
|
|
@ -86,6 +89,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
}
|
}
|
||||||
() = gc_loop(gc_config_for_loop) => {}
|
() = gc_loop(gc_config_for_loop) => {}
|
||||||
() = failed_paths_cleanup_loop(db.pool().clone(), failed_paths_ttl, failed_paths_cache) => {}
|
() = failed_paths_cleanup_loop(db.pool().clone(), failed_paths_ttl, failed_paths_cache) => {}
|
||||||
|
() = cancel_checker_loop(db.pool().clone(), active_builds) => {}
|
||||||
() = shutdown_signal() => {
|
() = shutdown_signal() => {
|
||||||
tracing::info!("Shutdown signal received, draining in-flight builds...");
|
tracing::info!("Shutdown signal received, draining in-flight builds...");
|
||||||
worker_pool_for_drain.drain();
|
worker_pool_for_drain.drain();
|
||||||
|
|
@ -176,6 +180,34 @@ async fn failed_paths_cleanup_loop(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn cancel_checker_loop(pool: sqlx::PgPool, active_builds: ActiveBuilds) {
|
||||||
|
let interval = Duration::from_secs(2);
|
||||||
|
loop {
|
||||||
|
tokio::time::sleep(interval).await;
|
||||||
|
|
||||||
|
let build_ids: Vec<uuid::Uuid> =
|
||||||
|
active_builds.iter().map(|entry| *entry.key()).collect();
|
||||||
|
|
||||||
|
if build_ids.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
match repo::builds::get_cancelled_among(&pool, &build_ids).await {
|
||||||
|
Ok(cancelled_ids) => {
|
||||||
|
for id in cancelled_ids {
|
||||||
|
if let Some((_, token)) = active_builds.remove(&id) {
|
||||||
|
tracing::info!(build_id = %id, "Triggering cancellation for running build");
|
||||||
|
token.cancel();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!("Failed to check for cancelled builds: {e}");
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn shutdown_signal() {
|
async fn shutdown_signal() {
|
||||||
let ctrl_c = async {
|
let ctrl_c = async {
|
||||||
tokio::signal::ctrl_c()
|
tokio::signal::ctrl_c()
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
use std::{path::PathBuf, sync::Arc, time::Duration};
|
use std::{path::PathBuf, sync::Arc, time::Duration};
|
||||||
|
|
||||||
|
use dashmap::DashMap;
|
||||||
use fc_common::{
|
use fc_common::{
|
||||||
alerts::AlertManager,
|
alerts::AlertManager,
|
||||||
config::{
|
config::{
|
||||||
|
|
@ -24,6 +25,10 @@ use fc_common::{
|
||||||
};
|
};
|
||||||
use sqlx::PgPool;
|
use sqlx::PgPool;
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
pub type ActiveBuilds = Arc<DashMap<Uuid, CancellationToken>>;
|
||||||
|
|
||||||
pub struct WorkerPool {
|
pub struct WorkerPool {
|
||||||
semaphore: Arc<Semaphore>,
|
semaphore: Arc<Semaphore>,
|
||||||
|
|
@ -37,7 +42,8 @@ pub struct WorkerPool {
|
||||||
signing_config: Arc<SigningConfig>,
|
signing_config: Arc<SigningConfig>,
|
||||||
cache_upload_config: Arc<CacheUploadConfig>,
|
cache_upload_config: Arc<CacheUploadConfig>,
|
||||||
alert_manager: Arc<Option<AlertManager>>,
|
alert_manager: Arc<Option<AlertManager>>,
|
||||||
drain_token: tokio_util::sync::CancellationToken,
|
drain_token: CancellationToken,
|
||||||
|
active_builds: ActiveBuilds,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl WorkerPool {
|
impl WorkerPool {
|
||||||
|
|
@ -68,7 +74,8 @@ impl WorkerPool {
|
||||||
signing_config: Arc::new(signing_config),
|
signing_config: Arc::new(signing_config),
|
||||||
cache_upload_config: Arc::new(cache_upload_config),
|
cache_upload_config: Arc::new(cache_upload_config),
|
||||||
alert_manager: Arc::new(alert_manager),
|
alert_manager: Arc::new(alert_manager),
|
||||||
drain_token: tokio_util::sync::CancellationToken::new(),
|
drain_token: CancellationToken::new(),
|
||||||
|
active_builds: Arc::new(DashMap::new()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -95,6 +102,10 @@ impl WorkerPool {
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn active_builds(&self) -> &ActiveBuilds {
|
||||||
|
&self.active_builds
|
||||||
|
}
|
||||||
|
|
||||||
#[tracing::instrument(skip(self, build), fields(build_id = %build.id, job = %build.job_name))]
|
#[tracing::instrument(skip(self, build), fields(build_id = %build.id, job = %build.job_name))]
|
||||||
pub fn dispatch(&self, build: Build) {
|
pub fn dispatch(&self, build: Build) {
|
||||||
if self.drain_token.is_cancelled() {
|
if self.drain_token.is_cancelled() {
|
||||||
|
|
@ -112,29 +123,45 @@ impl WorkerPool {
|
||||||
let signing_config = self.signing_config.clone();
|
let signing_config = self.signing_config.clone();
|
||||||
let cache_upload_config = self.cache_upload_config.clone();
|
let cache_upload_config = self.cache_upload_config.clone();
|
||||||
let alert_manager = self.alert_manager.clone();
|
let alert_manager = self.alert_manager.clone();
|
||||||
|
let active_builds = self.active_builds.clone();
|
||||||
|
let cancel_token = CancellationToken::new();
|
||||||
|
let build_id = build.id;
|
||||||
|
|
||||||
|
active_builds.insert(build_id, cancel_token.clone());
|
||||||
|
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
let _permit = match semaphore.acquire().await {
|
let result = async {
|
||||||
Ok(p) => p,
|
let _permit = match semaphore.acquire().await {
|
||||||
Err(_) => return,
|
Ok(p) => p,
|
||||||
|
Err(_) => return,
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Err(e) = run_build(
|
||||||
|
&pool,
|
||||||
|
&build,
|
||||||
|
&work_dir,
|
||||||
|
timeout,
|
||||||
|
&log_config,
|
||||||
|
&gc_config,
|
||||||
|
&notifications_config,
|
||||||
|
&signing_config,
|
||||||
|
&cache_upload_config,
|
||||||
|
&alert_manager,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
tracing::error!(build_id = %build.id, "Build dispatch failed: {e}");
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Err(e) = run_build(
|
tokio::select! {
|
||||||
&pool,
|
() = result => {}
|
||||||
&build,
|
() = cancel_token.cancelled() => {
|
||||||
&work_dir,
|
tracing::info!(build_id = %build_id, "Build cancelled, aborting");
|
||||||
timeout,
|
}
|
||||||
&log_config,
|
|
||||||
&gc_config,
|
|
||||||
&notifications_config,
|
|
||||||
&signing_config,
|
|
||||||
&cache_upload_config,
|
|
||||||
&alert_manager,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
tracing::error!(build_id = %build.id, "Build dispatch failed: {e}");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
active_builds.remove(&build_id);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue