From 34284fd5673d5e1f05e301c071398cae22794284 Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 17:12:34 +0800 Subject: [PATCH 1/4] =?UTF-8?q?proto(p2):=20deferred-main-spawn=20?= =?UTF-8?q?=E2=80=94=20validated=20+=20security-hardened?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deferred-main-spawn for IDLE-booted (pooled) sandboxes: full box semantics (exit code + json-file console logs) without a cold boot. - Guest: BOX_DEFERRED_MAIN=1 boots IDLE (skip boot spawn, stash BOX_EXEC_* cmd, CONTAINER_PID=-1; ECHILD-with-no-container keeps waiting instead of exiting). A spawn-main control frame runs the stashed cmd as the MAIN via the exec server's build_command (SAME seccomp/user/no-new-privs as a boot main, async- signal-safe pre_exec) with stdio overridden to inherit (→ console → json-file logs), under reaper::spawn_managed; pid CAS-published while MANAGED, then the supervision loop (reads the atomic each tick) reaps it for the real exit code. - Host: spec.rs passes BOX_DEFERRED_MAIN to the guest; ExecClient::spawn_main sends the trigger; boot sends it post-readiness. KVM-verified: (a) safe multi-threaded fork no deadlock (incl. 8 concurrent execs), (b) stdout/stderr stream-tagged in container.json, (c) exit code via /.a3s_exit_code, and security parity (seccomp + uid identical to a boot main; --user 1000 → uid 1000). Remaining for production: cgroup join (resource limits), config field instead of the env trigger, pool Request::SpawnMain, unit/e2e tests, docs. 
--- src/guest/init/src/exec_server.rs | 158 ++++++++++++++++++++++++++++++ src/guest/init/src/main.rs | 80 +++++++++++---- src/runtime/src/grpc/exec.rs | 36 +++++++ src/runtime/src/vm/mod.rs | 18 ++++ src/runtime/src/vm/spec.rs | 10 ++ 5 files changed, 283 insertions(+), 19 deletions(-) diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs index 5e9f4c3..3ad8e01 100644 --- a/src/guest/init/src/exec_server.rs +++ b/src/guest/init/src/exec_server.rs @@ -29,6 +29,14 @@ pub fn set_container_pid(pid: i32) { CONTAINER_PID.store(pid, Ordering::SeqCst); } +/// The main container PID (-1 if not yet spawned, -2 while a deferred spawn is in +/// flight). The PID 1 supervision loop reads this each tick, so a deferred main +/// published here (after an IDLE boot) is recognized as the container and reaped +/// for its real exit code. +pub fn container_pid() -> i32 { + CONTAINER_PID.load(Ordering::SeqCst) +} + /// Host→guest control to gracefully stop the container: deliver the given signal /// number to the main container process. `signal-main:` (e.g. `signal-main:15` /// for SIGTERM, `signal-main:2` for the image STOPSIGNAL=SIGINT). The container @@ -39,6 +47,19 @@ const EXEC_CONTROL_SIGNAL_MAIN: &[u8] = b"signal-main:"; #[cfg(target_os = "linux")] const EXEC_SIGNAL_MAIN_ACK: &[u8] = b"signal-main-ack"; +/// Host→guest control to spawn the container MAIN process on demand — for VMs that +/// booted IDLE (`BOX_DEFERRED_MAIN=1`, e.g. a pre-warmed pool sandbox). Payload is +/// `spawn-main:`. The spawned process becomes +/// the container main: it inherits PID 1's console fds (so its stdout/stderr reach +/// the json-file logs, unlike a piped exec) and the supervision loop reaps it for +/// the real exit code. Must match the host prefix in `runtime/src/grpc/exec.rs`. 
+#[cfg(target_os = "linux")] +const EXEC_CONTROL_SPAWN_MAIN: &[u8] = b"spawn-main:"; +#[cfg(target_os = "linux")] +const EXEC_SPAWN_MAIN_ACK: &[u8] = b"spawn-main-ack"; +#[cfg(target_os = "linux")] +const EXEC_SPAWN_MAIN_NACK: &[u8] = b"spawn-main-nack:"; + /// Deliver `sig` to the main container process (best-effort). #[cfg(target_os = "linux")] fn signal_main_process(sig: i32) { @@ -56,6 +77,126 @@ fn signal_main_process(sig: i32) { } } +/// The container command, stashed at boot (parsed from BOX_EXEC_*), so a later +/// `spawn-main` trigger can run it as the main without the host re-sending it. +#[cfg(target_os = "linux")] +struct DeferredMainSpec { + executable: String, + args: Vec<String>, + env: Vec<(String, String)>, + workdir: Option<String>, + user: Option<String>, +} + +#[cfg(target_os = "linux")] +static DEFERRED_MAIN: std::sync::Mutex<Option<DeferredMainSpec>> = std::sync::Mutex::new(None); + +/// Stash the container command for a deferred (IDLE) boot. The command already +/// reached the guest via BOX_EXEC_*, so the host only sends a bare spawn-main +/// trigger post-readiness; the guest runs the stashed command as its main. +#[cfg(target_os = "linux")] +pub fn set_deferred_main_spec( + executable: String, + args: Vec<String>, + env: Vec<(String, String)>, + workdir: Option<String>, + user: Option<String>, +) { + *DEFERRED_MAIN.lock().unwrap_or_else(|e| e.into_inner()) = Some(DeferredMainSpec { + executable, + args, + env, + workdir, + user, + }); +} + +/// Spawn the deferred container main (after an IDLE boot). The child inherits PID +/// 1's stdout/stderr — fds 1/2 = the virtio-console — so its output reaches +/// `console.log` → `container.json`, exactly like a boot-spawned main (and unlike a +/// `Stdio::piped` exec, whose output only flows over the exec stream). It is spawned +/// via `Command::spawn` (the same clone/exec the exec server already uses safely), +/// NOT `namespace::spawn_isolated`'s raw `fork()` — whose heavy allocating child +/// code could deadlock from this multi-threaded PID 1.
The pid is published WHILE +/// still registered MANAGED (the reaper can't reap it as an orphan before the +/// hand-off), then released to the supervision loop, which reaps it for the real +/// exit code. A CAS makes only the first spawn-main win. +#[cfg(target_os = "linux")] +fn spawn_deferred_main() -> Result<i32, String> { + // The container command + user, stashed at boot from BOX_EXEC_*. + let (executable, args, env, workdir, user) = { + let guard = DEFERRED_MAIN.lock().unwrap_or_else(|e| e.into_inner()); + let spec = guard.as_ref().ok_or("no deferred-main command set")?; + ( + spec.executable.clone(), + spec.args.clone(), + spec.env.clone(), + spec.workdir.clone(), + spec.user.clone(), + ) + }; + + // cmd vector + env. Include the guest's own A3S_SEC_* control vars so + // build_command applies the SAME seccomp/user/no-new-privs as a boot-spawned + // main (the container env carries them on a normal exec; here we add them). + let mut cmd_vec = Vec::with_capacity(1 + args.len()); + cmd_vec.push(executable); + cmd_vec.extend(args); + let mut env_entries: Vec<String> = env.iter().map(|(k, v)| format!("{k}={v}")).collect(); + for (k, v) in std::env::vars() { + if k.starts_with("A3S_SEC_") { + env_entries.push(format!("{k}={v}")); + } + } + + // Reuse the exec server's secured command builder (seccomp + user + no-new-privs + // via async-signal-safe pre_exec — already safe to spawn from this multi-threaded + // PID 1), then override stdio to INHERIT so the main's stdout/stderr reach PID + // 1's console fds (→ json-file logs), unlike an exec's piped stdio.
+ let (mut command, _timeout) = build_command( + ExecCommandSpec { + cmd: &cmd_vec, + timeout_ns: 0, + env: &env_entries, + working_dir: workdir.as_deref(), + rootfs: None, + stdin_data: None, + stdin_streaming: false, + user: user.as_deref(), + }, + None, + ) + .map_err(|out| String::from_utf8_lossy(&out.stderr).into_owned())?; + command + .stdout(std::process::Stdio::inherit()) + .stderr(std::process::Stdio::inherit()) + .stdin(std::process::Stdio::null()); + + // Idempotency: claim the sentinel (-1 → -2 pending); a second spawn-main loses. + if CONTAINER_PID + .compare_exchange(-1, -2, Ordering::SeqCst, Ordering::SeqCst) + .is_err() + { + return Err("container main already spawned".to_string()); + } + + match crate::reaper::spawn_managed(|| command.spawn()) { + Ok((child, guard)) => { + let pid = child.id() as i32; + // Publish the real pid (over the -2 marker) while still MANAGED, then + // release ownership: now the loop's `pid == container_pid` branch reaps. + CONTAINER_PID.store(pid, Ordering::SeqCst); + std::mem::forget(child); // PID 1's reaper owns it — do not double-reap + drop(guard); + Ok(pid) + } + Err(e) => { + CONTAINER_PID.store(-1, Ordering::SeqCst); // reset so a retry is possible + Err(format!("spawn failed: {e}")) + } + } +} + /// Maximum payload bytes per streamed exec chunk. 
const STREAM_CHUNK_BYTES: usize = 16 * 1024; const EXEC_CONTROL_CANCEL: &[u8] = b"cancel"; @@ -210,6 +351,23 @@ fn handle_connection(fd: std::os::fd::OwnedFd) -> Result<(), Box { + info!(pid, "Deferred container main spawned"); + write_frame(&mut stream, FrameType::Control as u8, EXEC_SPAWN_MAIN_ACK)?; + } + Err(e) => { + warn!(error = %e, "spawn-main failed"); + let mut nack = EXEC_SPAWN_MAIN_NACK.to_vec(); + nack.extend_from_slice(e.as_bytes()); + write_frame(&mut stream, FrameType::Control as u8, &nack)?; + } + } + std::mem::forget(fd); + return Ok(()); + } send_error_frame(&mut stream, "Expected Data frame")?; std::mem::forget(fd); return Ok(()); diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs index 72fb1fe..dd6221d 100644 --- a/src/guest/init/src/main.rs +++ b/src/guest/init/src/main.rs @@ -358,22 +358,52 @@ fn run_init() -> Result<(), Box> { .map(|(k, v)| (k.as_str(), v.as_str())) .collect(); - let container_pid_raw = namespace::spawn_isolated( - &namespace_config, - &exec_config.executable, - &args_refs, - &env_refs, - &exec_config.workdir, - exec_config.user.as_deref(), - )?; - let container_pid = nix::unistd::Pid::from_raw(container_pid_raw as i32); - - info!("Container process started with PID {}", container_pid); - - // Make the main container PID available to the exec server so a host - // graceful-stop request (signal-main control frame) can deliver the - // STOPSIGNAL to it. Must be set before the exec server thread starts. - exec_server::set_container_pid(container_pid_raw as i32); + // Deferred-main (BOX_DEFERRED_MAIN=1): boot IDLE — skip the boot spawn and let + // the container main be spawned later by a `spawn-main` control frame (for a + // pre-warmed/pooled sandbox). CONTAINER_PID stays the -1 sentinel; the exec + // server + supervision loop start as usual, so host readiness still passes + // (the heartbeat handshake has no container-pid dependency). 
+ let deferred_main = std::env::var("BOX_DEFERRED_MAIN") + .map(|v| v == "1") + .unwrap_or(false); + + let container_pid = if deferred_main { + info!("BOX_DEFERRED_MAIN=1 — booting IDLE; container main deferred to a spawn-main control frame"); + // Stash the parsed command so a later spawn-main trigger runs it as main. + #[cfg(target_os = "linux")] + exec_server::set_deferred_main_spec( + exec_config.executable.clone(), + exec_config.args.clone(), + exec_config + .env + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(), + if exec_config.workdir.is_empty() { + None + } else { + Some(exec_config.workdir.clone()) + }, + exec_config.user.clone(), + ); + nix::unistd::Pid::from_raw(-1) + } else { + let container_pid_raw = namespace::spawn_isolated( + &namespace_config, + &exec_config.executable, + &args_refs, + &env_refs, + &exec_config.workdir, + exec_config.user.as_deref(), + )?; + info!("Container process started with PID {}", container_pid_raw); + + // Make the main container PID available to the exec server so a host + // graceful-stop request (signal-main control frame) can deliver the + // STOPSIGNAL to it. Must be set before the exec server thread starts. + exec_server::set_container_pid(container_pid_raw as i32); + nix::unistd::Pid::from_raw(container_pid_raw as i32) + }; expose_container_env_to_exec(&exec_config); @@ -1028,13 +1058,25 @@ fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box (pid, 128 + signal as i32, true), // No exited child right now: stop draining and poll again later. Ok(_) => break, - // No children at all (container already gone): nothing to supervise. - Err(nix::errno::Errno::ECHILD) => return Ok(()), + // No children right now. In deferred-main mode (IDLE boot) the + // container main has not been spawned yet — keep waiting for the + // spawn-main frame rather than exiting (which would halt the VM + // before the main ever runs). Otherwise the container is gone: done. 
+ Err(nix::errno::Errno::ECHILD) => { + if exec_server::container_pid() < 0 { + break; + } + return Ok(()); + } // Transient error: retry on the next tick. Err(_) => break, }; - if pid == container_pid { + // Read the container pid fresh each iteration: a deferred main (IDLE + // boot) publishes it late via spawn-main; the eager path set it at boot. + // The -1/-2 sentinels (unset/pending) never match a real pid. + let cpid = exec_server::container_pid(); + if cpid >= 0 && pid.as_raw() == cpid { // The container drives the VM lifecycle: reap it and exit with its // status so the host (and detached `run -d wait`) sees the real code. let _ = waitpid(pid, None); diff --git a/src/runtime/src/grpc/exec.rs b/src/runtime/src/grpc/exec.rs index 5b2e3b7..a4f93e8 100644 --- a/src/runtime/src/grpc/exec.rs +++ b/src/runtime/src/grpc/exec.rs @@ -20,6 +20,10 @@ const EXEC_FLUSH_ACK: &[u8] = b"flush-ack"; /// received and the signal delivered. Must match the guest's /// `EXEC_SIGNAL_MAIN_ACK` in `guest/init/src/exec_server.rs`. const EXEC_SIGNAL_MAIN_ACK: &[u8] = b"signal-main-ack"; +/// Guest→host acknowledgement that a `spawn-main` deferred-main control was +/// received and the container main spawned. Matches the guest's +/// `EXEC_SPAWN_MAIN_ACK` in `guest/init/src/exec_server.rs`. +const EXEC_SPAWN_MAIN_ACK: &[u8] = b"spawn-main-ack"; type ExecFrameReader = a3s_transport::FrameReader>; type ExecFrameWriter = a3s_transport::FrameWriter>; @@ -275,6 +279,38 @@ impl ExecClient { _ => Ok(false), } } + + /// Ask a guest that booted IDLE (`BOX_DEFERRED_MAIN=1`) to spawn its container + /// command — already known to the guest via BOX_EXEC_* — as the MAIN process. + /// The spawned main inherits the console (so its output reaches the json-file + /// logs) and drives the VM lifecycle. Returns `Ok(true)` if acknowledged. 
+ pub async fn spawn_main(&self) -> Result { + let mut stream = match UnixStream::connect(&self.socket_path).await { + Ok(s) => s, + Err(_) => return Ok(false), + }; + + let frame = a3s_transport::Frame::control(b"spawn-main:".to_vec()); + let encoded = frame + .encode() + .map_err(|e| BoxError::ExecError(format!("spawn-main frame encode failed: {}", e)))?; + + if stream.write_all(&encoded).await.is_err() { + return Ok(false); + } + + let (r, _w) = tokio::io::split(stream); + let mut reader = a3s_transport::FrameReader::new(r); + match reader.read_frame().await { + Ok(Some(f)) + if f.frame_type == a3s_transport::FrameType::Control + && f.payload == EXEC_SPAWN_MAIN_ACK => + { + Ok(true) + } + _ => Ok(false), + } + } } /// Handle for reading streaming exec events. diff --git a/src/runtime/src/vm/mod.rs b/src/runtime/src/vm/mod.rs index 86cf62f..4f63d57 100644 --- a/src/runtime/src/vm/mod.rs +++ b/src/runtime/src/vm/mod.rs @@ -728,6 +728,24 @@ impl VmManager { } } + // Prototype: deferred-main-spawn. The guest booted IDLE (BOX_DEFERRED_MAIN); + // now that the exec server is ready, tell it to spawn the container command + // (already passed via BOX_EXEC_*) as the MAIN process — full box semantics + // (exit code + json-file console logs) without a cold boot. + #[cfg(unix)] + if std::env::var("BOX_DEFERRED_MAIN") + .map(|v| v == "1") + .unwrap_or(false) + { + if let Some(client) = self.exec_client.as_ref() { + match client.spawn_main().await { + Ok(true) => tracing::info!("deferred container main spawned"), + Ok(false) => tracing::warn!("deferred spawn-main not acknowledged"), + Err(e) => tracing::warn!(error = %e, "deferred spawn-main failed"), + } + } + } + // 5b2. 
Store socket paths for CRI streaming access self.exec_socket_path = Some(layout.exec_socket_path.clone()); self.pty_socket_path = Some(layout.pty_socket_path.clone()); diff --git a/src/runtime/src/vm/spec.rs b/src/runtime/src/vm/spec.rs index f38bca8..4f07c40 100644 --- a/src/runtime/src/vm/spec.rs +++ b/src/runtime/src/vm/spec.rs @@ -132,6 +132,16 @@ impl VmManager { env.push((format!("BOX_EXEC_ARG_{}", i), arg.clone())); } + // Prototype: deferred-main-spawn. If the host set BOX_DEFERRED_MAIN=1, + // tell guest init to boot IDLE; the runtime then sends a spawn-main + // control frame post-readiness to run the command above as the main. + if std::env::var("BOX_DEFERRED_MAIN") + .map(|v| v == "1") + .unwrap_or(false) + { + env.push(("BOX_DEFERRED_MAIN".to_string(), "1".to_string())); + } + // Pass the effective working directory to guest init so PID 1 and // the container entrypoint agree even when no OCI WORKDIR is set. env.push(("BOX_EXEC_WORKDIR".to_string(), workdir.clone())); From 7fd363e30d2b418831841b3feee4b4e98dfd884c Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 17:44:10 +0800 Subject: [PATCH 2/4] =?UTF-8?q?feat(p2):=20pool=20integration=20=E2=80=94?= =?UTF-8?q?=20deferred-main=20full=20box=20semantics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `pool start --deferred` boots pooled VMs IDLE; `pool run` then spawns the per- request command as the box's real MAIN via the spawn-main control frame (hybrid: frame carries the command since the VM is pre-warmed before the command is known), giving full box semantics — real exit code (/.a3s_exit_code) + json-file console logs — vs the keepalive+exec MVP's piped exec-stream output. - core: BoxConfig.deferred_main; spec.rs/mod.rs honor it (env override kept for tests). Guest spawn-main now accepts an optional command in the frame. 
- runtime: VmManager::run_deferred_main (send spawn-main + wait for the main to exit + read the box's container.json split by stream); ExecClient::spawn_main takes an optional spec. - cli: pool --deferred flag; handle_conn routes to run_deferred_main. --- src/cli/src/commands/pool.rs | 43 ++++++++++++++++- src/core/src/config.rs | 8 ++++ src/guest/init/src/exec_server.rs | 47 +++++++++++++----- src/runtime/src/grpc/exec.rs | 8 +++- src/runtime/src/vm/mod.rs | 80 ++++++++++++++++++++++++++++++- src/runtime/src/vm/spec.rs | 7 +-- 6 files changed, 172 insertions(+), 21 deletions(-) diff --git a/src/cli/src/commands/pool.rs b/src/cli/src/commands/pool.rs index 71b0a0a..e863d24 100644 --- a/src/cli/src/commands/pool.rs +++ b/src/cli/src/commands/pool.rs @@ -72,6 +72,12 @@ pub struct PoolStartArgs { #[arg(long, value_delimiter = ',')] pub warm: Vec, + /// Boot pooled VMs IDLE and run each `pool run` command as the box's real MAIN + /// (full box semantics: exit code + json-file console logs), instead of + /// exec-into-keepalive. + #[arg(long)] + pub deferred: bool, + /// Output as JSON #[arg(long)] pub json: bool, @@ -176,6 +182,17 @@ fn keepalive_cmd() -> Vec { ] } +/// Build the `spawn-main` JSON spec for a deferred-mode pool command (executable + +/// args + a standard PATH so the binary resolves like a normal container main). +fn deferred_spec_json(cmd: &[String]) -> Vec { + let spec = serde_json::json!({ + "executable": cmd.first().map(String::as_str).unwrap_or("/bin/sh"), + "args": cmd.get(1..).unwrap_or(&[]), + "env": [["PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"]], + }); + serde_json::to_vec(&spec).unwrap_or_default() +} + /// Parse a `--warm` entry of the form `image[=count]` (count defaults to `default_size`). 
fn parse_warm_spec(entry: &str, default_size: usize) -> Result<(String, usize), String> { match entry.split_once('=') { @@ -212,6 +229,9 @@ struct PoolRegistry { size: usize, max: usize, ttl: u64, + /// When true, pooled VMs boot IDLE and `pool run` spawns the command as the + /// box's real MAIN (full box semantics), instead of exec-into-keepalive. + deferred: bool, } impl PoolRegistry { @@ -234,8 +254,11 @@ impl PoolRegistry { }; let box_config = BoxConfig { image: image.to_string(), + // In deferred mode the VM boots IDLE (keepalive cmd is stashed but + // unused — the per-request command arrives via spawn-main). cmd: keepalive_cmd(), pool: pool_config.clone(), + deferred_main: self.deferred, ..Default::default() }; let pool = std::sync::Arc::new( @@ -303,6 +326,7 @@ async fn execute_start(args: PoolStartArgs) -> Result<(), Box err_resp(format!("acquire failed: {e}")), - Ok(vm) => { - let resp = match vm.exec_command(run.cmd, EXEC_TIMEOUT_NS).await { + Ok(mut vm) => { + // Deferred-main: run the command as the box's real MAIN + // (full box semantics — exit code + json-file console logs). + // Otherwise exec it in the keepalive VM (output via the + // exec stream). 
+ let result = if registry.deferred { + vm.run_deferred_main( + &deferred_spec_json(&run.cmd), + std::time::Duration::from_secs(60), + ) + .await + } else { + vm.exec_command(run.cmd, EXEC_TIMEOUT_NS).await + }; + let resp = match result { Ok(o) => RunResponse { stdout: o.stdout, stderr: o.stderr, @@ -763,6 +800,7 @@ mod tests { ttl: 300, socket: DEFAULT_SOCKET.to_string(), warm: vec![], + deferred: false, json: false, }; let result = execute_start(args).await; @@ -779,6 +817,7 @@ mod tests { ttl: 300, socket: DEFAULT_SOCKET.to_string(), warm: vec![], + deferred: false, json: false, }; let result = execute_start(args).await; diff --git a/src/core/src/config.rs b/src/core/src/config.rs index eedd36d..f2de9b9 100644 --- a/src/core/src/config.rs +++ b/src/core/src/config.rs @@ -310,6 +310,13 @@ pub struct BoxConfig { #[serde(default)] pub pool: PoolConfig, + /// Boot the VM IDLE — do not spawn the container main at boot; instead the + /// main is started later by a `spawn-main` control frame. Used by the pool so a + /// pre-warmed sandbox runs a per-request command as its real main, with full box + /// semantics (exit code + json-file console logs) and no cold boot. + #[serde(default)] + pub deferred_main: bool, + /// Port mappings: "host_port:guest_port" (e.g., "8080:80") /// Maps host ports to guest ports via TSI (Transparent Socket Impersonation). #[serde(default)] @@ -407,6 +414,7 @@ impl Default for BoxConfig { extra_env: vec![], cache: CacheConfig::default(), pool: PoolConfig::default(), + deferred_main: false, port_map: vec![], dns: vec![], add_hosts: vec![], diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs index 3ad8e01..b27f0af 100644 --- a/src/guest/init/src/exec_server.rs +++ b/src/guest/init/src/exec_server.rs @@ -80,11 +80,16 @@ fn signal_main_process(sig: i32) { /// The container command, stashed at boot (parsed from BOX_EXEC_*), so a later /// `spawn-main` trigger can run it as the main without the host re-sending it. 
#[cfg(target_os = "linux")] +#[derive(serde::Deserialize)] struct DeferredMainSpec { executable: String, + #[serde(default)] args: Vec<String>, + #[serde(default)] env: Vec<(String, String)>, + #[serde(default)] workdir: Option<String>, + #[serde(default)] user: Option<String>, } @@ -122,18 +127,23 @@ pub fn set_deferred_main_spec( /// hand-off), then released to the supervision loop, which reaps it for the real /// exit code. A CAS makes only the first spawn-main win. #[cfg(target_os = "linux")] -fn spawn_deferred_main() -> Result<i32, String> { - // The container command + user, stashed at boot from BOX_EXEC_*. - let (executable, args, env, workdir, user) = { - let guard = DEFERRED_MAIN.lock().unwrap_or_else(|e| e.into_inner()); - let spec = guard.as_ref().ok_or("no deferred-main command set")?; - ( - spec.executable.clone(), - spec.args.clone(), - spec.env.clone(), - spec.workdir.clone(), - spec.user.clone(), - ) +fn spawn_deferred_main(frame: Option<DeferredMainSpec>) -> Result<i32, String> { + // Use the command carried in the frame (the pool path — a pre-warmed VM gets + // its per-request command here), else the one stashed at boot from BOX_EXEC_* + // (the `run` path, where the command is known at boot). + let (executable, args, env, workdir, user) = match frame { + Some(s) => (s.executable, s.args, s.env, s.workdir, s.user), + None => { + let guard = DEFERRED_MAIN.lock().unwrap_or_else(|e| e.into_inner()); + let spec = guard.as_ref().ok_or("no deferred-main command set")?; + ( + spec.executable.clone(), + spec.args.clone(), + spec.env.clone(), + spec.workdir.clone(), + spec.user.clone(), + ) + } }; // cmd vector + env.
Include the guest's own A3S_SEC_* control vars so @@ -353,7 +363,18 @@ fn handle_connection(fd: std::os::fd::OwnedFd) -> Result<(), Box(body) { + Ok(spec) => spawn_deferred_main(Some(spec)), + Err(e) => Err(format!("invalid spawn-main spec: {e}")), + } + }; + match result { Ok(pid) => { info!(pid, "Deferred container main spawned"); write_frame(&mut stream, FrameType::Control as u8, EXEC_SPAWN_MAIN_ACK)?; diff --git a/src/runtime/src/grpc/exec.rs b/src/runtime/src/grpc/exec.rs index a4f93e8..22c7885 100644 --- a/src/runtime/src/grpc/exec.rs +++ b/src/runtime/src/grpc/exec.rs @@ -284,13 +284,17 @@ impl ExecClient { /// command — already known to the guest via BOX_EXEC_* — as the MAIN process. /// The spawned main inherits the console (so its output reaches the json-file /// logs) and drives the VM lifecycle. Returns `Ok(true)` if acknowledged. - pub async fn spawn_main(&self) -> Result { + pub async fn spawn_main(&self, spec_json: Option<&[u8]>) -> Result { let mut stream = match UnixStream::connect(&self.socket_path).await { Ok(s) => s, Err(_) => return Ok(false), }; - let frame = a3s_transport::Frame::control(b"spawn-main:".to_vec()); + let mut payload = b"spawn-main:".to_vec(); + if let Some(json) = spec_json { + payload.extend_from_slice(json); + } + let frame = a3s_transport::Frame::control(payload); let encoded = frame .encode() .map_err(|e| BoxError::ExecError(format!("spawn-main frame encode failed: {}", e)))?; diff --git a/src/runtime/src/vm/mod.rs b/src/runtime/src/vm/mod.rs index 4f63d57..41de141 100644 --- a/src/runtime/src/vm/mod.rs +++ b/src/runtime/src/vm/mod.rs @@ -489,6 +489,80 @@ impl VmManager { Ok(None) } + /// Run a command as the container MAIN in an IDLE-booted (deferred-main) VM. + /// + /// Sends the `spawn-main` control frame carrying `spec_json` (the command), + /// waits for the main to exit (which halts the VM), and returns its real exit + /// code + the box's json-file console logs split by stream. 
This is the full- + /// box-semantics counterpart to [`Self::exec_command`] (whose output is piped + /// over the exec stream, not the json-file logs). + #[cfg(unix)] + pub async fn run_deferred_main( + &mut self, + spec_json: &[u8], + timeout: std::time::Duration, + ) -> Result { + let acked = { + let client = self + .exec_client + .as_ref() + .ok_or_else(|| BoxError::ExecError("Exec client not connected".to_string()))?; + client.spawn_main(Some(spec_json)).await? + }; + if !acked { + return Err(BoxError::ExecError( + "spawn-main was not acknowledged by the guest".to_string(), + )); + } + + // Wait for the main to exit — guest-init persists the code and halts the VM. + let start = std::time::Instant::now(); + let exit_code = loop { + if let Some(code) = self.try_wait_exit().await? { + break code; + } + if start.elapsed() >= timeout { + return Err(BoxError::ExecError( + "deferred main did not exit within the timeout".to_string(), + )); + } + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + }; + + // Let the shim's log processor finish draining console.log into the json + // file (it flushes as the VM halts) before reading the captured output. + tokio::time::sleep(std::time::Duration::from_millis(150)).await; + let (stdout, stderr) = self.read_container_logs(); + Ok(a3s_box_core::exec::ExecOutput { + stdout, + stderr, + exit_code, + }) + } + + /// Read the box's json-file console logs, split into stdout/stderr by stream. 
+ fn read_container_logs(&self) -> (Vec, Vec) { + let path = self + .home_dir + .join("boxes") + .join(&self.box_id) + .join("logs") + .join("container.json"); + let (mut out, mut err) = (Vec::new(), Vec::new()); + if let Ok(content) = std::fs::read_to_string(&path) { + for line in content.lines() { + if let Ok(entry) = serde_json::from_str::(line) { + if entry.stream == "stderr" { + err.extend_from_slice(entry.log.as_bytes()); + } else { + out.extend_from_slice(entry.log.as_bytes()); + } + } + } + } + (out, err) + } + /// Execute a command in the guest VM. /// /// Requires the VM to be in Ready, Busy, or Compacting state. @@ -732,13 +806,17 @@ impl VmManager { // now that the exec server is ready, tell it to spawn the container command // (already passed via BOX_EXEC_*) as the MAIN process — full box semantics // (exit code + json-file console logs) without a cold boot. + // Auto-trigger spawn-main only for the env-driven `run` path, where the + // command is known at boot. The pool sets config.deferred_main to boot the + // VM IDLE but drives spawn-main EXPLICITLY per request (the per-request + // command isn't known at pre-warm), so a pool VM must NOT auto-trigger here. #[cfg(unix)] if std::env::var("BOX_DEFERRED_MAIN") .map(|v| v == "1") .unwrap_or(false) { if let Some(client) = self.exec_client.as_ref() { - match client.spawn_main().await { + match client.spawn_main(None).await { Ok(true) => tracing::info!("deferred container main spawned"), Ok(false) => tracing::warn!("deferred spawn-main not acknowledged"), Err(e) => tracing::warn!(error = %e, "deferred spawn-main failed"), diff --git a/src/runtime/src/vm/spec.rs b/src/runtime/src/vm/spec.rs index 4f07c40..8ee0231 100644 --- a/src/runtime/src/vm/spec.rs +++ b/src/runtime/src/vm/spec.rs @@ -135,9 +135,10 @@ impl VmManager { // Prototype: deferred-main-spawn. 
If the host set BOX_DEFERRED_MAIN=1, // tell guest init to boot IDLE; the runtime then sends a spawn-main // control frame post-readiness to run the command above as the main. - if std::env::var("BOX_DEFERRED_MAIN") - .map(|v| v == "1") - .unwrap_or(false) + if self.config.deferred_main + || std::env::var("BOX_DEFERRED_MAIN") + .map(|v| v == "1") + .unwrap_or(false) { env.push(("BOX_DEFERRED_MAIN".to_string(), "1".to_string())); } From e47de08acf735a238c63178de80c9ba56cf26f75 Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 17:57:29 +0800 Subject: [PATCH 3/4] test(p2): deferred_spec_json unit + deferred-pool e2e Unit: deferred_spec_json builds the spawn-main JSON (executable+args+PATH, shell fallback). Host-backed e2e test_real_pool_deferred_main: pool start --deferred + pool run asserts stdout/stderr come back from the box's json-file logs and the real exit code propagates. --- src/cli/src/commands/pool.rs | 17 ++++++++ src/cli/tests/host_smoke.rs | 80 ++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) diff --git a/src/cli/src/commands/pool.rs b/src/cli/src/commands/pool.rs index e863d24..89a2876 100644 --- a/src/cli/src/commands/pool.rs +++ b/src/cli/src/commands/pool.rs @@ -731,6 +731,23 @@ mod tests { assert!(parse_warm_spec("=4", 2).is_err()); } + #[test] + fn test_deferred_spec_json() { + // The spawn-main spec for a deferred pool run: executable + args + a PATH + // so the binary resolves like a normal container main. + let json = deferred_spec_json(&["sh".into(), "-c".into(), "echo hi".into()]); + let v: serde_json::Value = serde_json::from_slice(&json).unwrap(); + assert_eq!(v["executable"], "sh"); + assert_eq!(v["args"][0], "-c"); + assert_eq!(v["args"][1], "echo hi"); + assert_eq!(v["env"][0][0], "PATH"); + assert!(v["env"][0][1].as_str().unwrap().contains("/bin")); + // Empty cmd falls back to a shell rather than panicking. 
+ let j2 = deferred_spec_json(&[]); + let v2: serde_json::Value = serde_json::from_slice(&j2).unwrap(); + assert_eq!(v2["executable"], "/bin/sh"); + } + #[tokio::test] async fn test_backpressure_bounds_concurrency() { // The contract PoolEntry relies on: a permit (held until teardown) caps diff --git a/src/cli/tests/host_smoke.rs b/src/cli/tests/host_smoke.rs index b1dd913..c63ec14 100644 --- a/src/cli/tests/host_smoke.rs +++ b/src/cli/tests/host_smoke.rs @@ -600,3 +600,83 @@ fn test_real_pool_warm_run() { let _ = daemon.kill(); let _ = daemon.wait(); } + +/// Deferred-main pool end-to-end: `pool start --deferred` boots pooled VMs IDLE, +/// and `pool run` spawns the command as the box's real MAIN — full box semantics +/// (stdout/stderr from the box's json-file console logs + the real exit code), +/// unlike the keepalive+exec MVP's exec-stream output. Host-backed (KVM). +#[test] +#[ignore] +fn test_real_pool_deferred_main() { + let cli = CliTest::new(); + let image = host_smoke_image(); + seed_runnable_alpine_image(&cli, &image); + let socket = cli + .home_path() + .join("pd.sock") + .to_str() + .expect("utf8 socket path") + .to_string(); + + let mut daemon = cli.spawn_background(&[ + "pool", + "start", + "--deferred", + "--image", + image.as_str(), + "--size", + "2", + "--max", + "4", + "--socket", + socket.as_str(), + ]); + + let sock_path = cli.home_path().join("pd.sock"); + let start = std::time::Instant::now(); + while !sock_path.exists() { + if start.elapsed() > Duration::from_secs(120) { + let _ = daemon.kill(); + panic!("deferred pool daemon never created its socket"); + } + if let Ok(Some(status)) = daemon.try_wait() { + panic!("deferred pool daemon exited early: {status}"); + } + std::thread::sleep(Duration::from_millis(200)); + } + std::thread::sleep(Duration::from_secs(5)); + + // Full box semantics: stdout + stderr come back from the box's json-file logs. 
+ let (out, err, ok) = cli.output(&[ + "pool", + "run", + "--socket", + socket.as_str(), + "--", + "sh", + "-c", + "echo deferred-stdout; echo deferred-stderr 1>&2; exit 0", + ]); + assert!( + ok, + "deferred pool run failed.\nstdout:\n{out}\nstderr:\n{err}" + ); + assert!(out.contains("deferred-stdout"), "missing stdout: {out:?}"); + assert!(err.contains("deferred-stderr"), "missing stderr: {err:?}"); + + // The real container exit code propagates (not the exec-stream's). + let (_o, _e, ok2) = cli.output(&[ + "pool", + "run", + "--socket", + socket.as_str(), + "--", + "sh", + "-c", + "exit 7", + ]); + assert!(!ok2, "expected a non-zero exit from the deferred main"); + + let _ = daemon.kill(); + let _ = daemon.wait(); +} From 82b6c9099903a26311713ac2dfcd96d353256014 Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 18:00:48 +0800 Subject: [PATCH 4/4] docs(p2): mark deferred-main-spawn implemented + usage; README pool note Update the P2 design doc to IMPLEMENTED with usage (pool start --deferred / pool run) and what landed vs the design (incl. cgroup parity is free via VM-level limits). Add a warm-pool + --deferred bullet to the README verified-behavior list. --- README.md | 3 ++- docs/p2-deferred-main-spawn-design.md | 32 +++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9e7473e..dcc1fd0 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,8 @@ The ignored `core_smoke` suite covers the core CLI path on a real MicroVM host: - non-TTY `exec`, PTY, `attach`, `logs`, `stop`, `wait`, and `rm`; - TCP published ports with host loopback HTTP reachability; - bridge network endpoint allocation, peer `/etc/hosts`, connect/disconnect, and force removal cleanup; -- named volumes, `cp`, `diff`, `export`, `commit`, `snapshot`, restart-policy monitor recovery, and Compose health/volume flow. 
+- named volumes, `cp`, `diff`, `export`, `commit`, `snapshot`, restart-policy monitor recovery, and Compose health/volume flow; +- warm pool (`pool start`/`pool run`): pre-warmed sandboxes served over a socket, with backpressure and multi-image lazy pools; `--deferred` runs each command as the box's real main for full box semantics (real exit code + json-file console logs) with no cold boot (covered by the ignored `host_smoke` tests `test_real_pool_warm_run` and `test_real_pool_deferred_main`, not by `core_smoke`). The most recent local record in this branch: all 14 ignored `core_smoke` tests passed on macOS HVF with an offline Alpine OCI archive, and the ignored diff --git a/docs/p2-deferred-main-spawn-design.md b/docs/p2-deferred-main-spawn-design.md index a763ba0..4fa859e 100644 --- a/docs/p2-deferred-main-spawn-design.md +++ b/docs/p2-deferred-main-spawn-design.md @@ -1,9 +1,33 @@ # Design: P2 — Deferred-Main-Spawn (full box semantics for pooled sandboxes) -Status: **GO-WITH-CONDITIONS** (design + prototype-first). Builds on -`refactor/init-readiness` (PR #15: early-bind + event-driven readiness + PID1 -reaper) and `feat/p1-template-pool` (PR #18: the warm-sandbox pool controller). -Derived from an adversarial mapping of the real #15+#18 base. +Status: **IMPLEMENTED & KVM-verified** (the GO-WITH-CONDITIONS design below was +confirmed in practice). Builds on PR #15 (early-bind + event-driven readiness + +PID1 reaper) and PR #18 (the warm-sandbox pool controller). + +## 0. Implementation status & usage + +```sh +# Boot a pool of IDLE sandboxes (no container main at boot)... 
+a3s-box pool start --deferred --image alpine:latest --size 4 --socket /tmp/p.sock +# ...then run a command as the box's REAL main — full box semantics: +a3s-box pool run --socket /tmp/p.sock -- sh -c 'echo hi; exit 7' # exit 7; output in the json-file logs +``` + +What landed (vs the design): a `BOX_DEFERRED_MAIN=1`/`BoxConfig.deferred_main` +IDLE boot (skip the boot spawn; the `ECHILD`-with-no-container case keeps PID 1 +waiting instead of exiting — see §5/Phase 1); a `spawn-main` control frame (bare +for the `run` path's boot-stashed command, or carrying a command for the pool, +which pre-warms before the command is known); the deferred main spawned via the +exec server's `build_command` (**identical seccomp/user/no-new-privs** to a boot +main — verified `Seccomp: 2`, `--user 1000`→uid 1000) with stdio overridden to +`inherit` so its stdout/stderr reach the json-file console logs; the pid +CAS-published while MANAGED then reaped by the supervision loop for the real exit +code; `pool start --deferred` + `VmManager::run_deferred_main`. Resource limits +need no extra work — they are VM-level (libkrun `set_vm_config`), so a deferred +main shares the boot main's limits. KVM-verified end to end (exit codes 7/3/0, +stdout+stderr from the json-file logs, seccomp applied — all from a pre-warmed +pool), with unit (`deferred_spec_json`) + host e2e (`test_real_pool_deferred_main`) +coverage. Not yet wired: a typed pool API (`Request::SpawnMain`) beyond the CLI. ## 1. Goal