From 53a2f3e3f32430b38a910d7507a88c38fbb84d26 Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 10:47:46 +0800 Subject: [PATCH 1/3] refactor(init): early-bind vsock servers + event-driven readiness (issue #3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructures the exec/PTY readiness path so boot waits for a real readiness EVENT bounded by VM liveness, instead of guessing a fixed timeout — replacing the interim 10s→30s band-aid. P1 — bind early, serve late. Split exec_server/pty_server into bind_*()->Listener (pure socket/bind/listen syscalls) and serve_*(listener) (the accept loop). run_init now binds both vsock listeners on the main thread right after the filesystem mounts (Step 2.6), BEFORE the slow network bring-up and the container fork, then spawns the accept loops after the fork (Step 8). Binding adds no thread, so the single-threaded-at-fork invariant that keeps spawn_isolated safe is preserved. The listen backlog is filled from boot, so a host connect QUEUES instead of being refused — this removes the `run -it` PTY "Connection refused". CLOEXEC keeps the forked container from inheriting the listeners. P2 — event-driven, liveness-bounded readiness. Early binding makes the host `connect` succeed immediately, so heartbeat()'s (timeout-less) read would block until the guest's accept loop runs. wait_for_exec_ready is rewritten to bound each connect+heartbeat attempt (tokio timeout), return at once when the VM exits (has_exited, zombie-aware — fast-exit containers never stall), and treat a large absolute cap purely as a backstop against a wedged-but-alive guest. A healthy guest passes the heartbeat the moment its accept loop runs, however late in a slow cold boot — so the false "heartbeat failed" warning is gone without a fixed budget to outrun. Also folds in the issue-#3 cleanups: dead `/sbin/init` BOX_EXEC_EXEC default → `/bin/sh`, and the stale resolve_oci_entrypoint doc comment. Deferred: an explicit guest→host "ready" beacon on a new vsock port was considered but NOT wired — port_forward uses add_vsock_port(listen=true) with a guest connect-out, which contradicts the assumed listen=false direction for guest→host, and that is only verifiable on KVM. The liveness-bounded heartbeat achieves the same correctness without guessing cross-process vsock semantics. Supersedes the interim 30s fix (PR #14). --- src/guest/init/src/exec_server.rs | 96 ++++++++++++++++++--------- src/guest/init/src/main.rs | 33 ++++++--- src/guest/init/src/pty_server.rs | 90 ++++++++++++++++--------- src/runtime/src/vm/ready.rs | 107 ++++++++++++++---------------- src/runtime/src/vm/spec.rs | 3 +- 5 files changed, 201 insertions(+), 128 deletions(-) diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs index a593ef7..eb7da21 100644 --- a/src/guest/init/src/exec_server.rs +++ b/src/guest/init/src/exec_server.rs @@ -67,52 +67,88 @@ const EXEC_CONTROL_FLUSH: &[u8] = b"flush"; /// match the host's `EXEC_FLUSH_ACK` in `runtime/src/grpc/exec.rs`. const EXEC_FLUSH_ACK: &[u8] = b"flush-ack"; -/// Run the exec server, listening on vsock port 4089. +/// A bound, listening exec-server socket — produced by [`bind_exec_server`] and +/// consumed by [`serve_exec_server`]. /// -/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`. -/// On non-Linux platforms, this is a no-op (development stub). -pub fn run_exec_server() -> Result<(), Box> { - info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT); +/// Splitting bind from serve lets guest-init bind the exec vsock port EARLY on +/// the main thread (pure socket/bind/listen syscalls — no thread spawn, so the +/// later single-threaded container `fork()` stays fork-safe) while the accept +/// loop runs afterwards in its own thread. Binding early fills the listen +/// backlog from the start of boot, so a host connect QUEUES instead of being +/// refused while the slower boot steps (network, container spawn) finish — this +/// removes the "Connection refused" / heartbeat race of issue #3. On non-Linux +/// this is an inert placeholder so callers stay platform-agnostic. +#[cfg(target_os = "linux")] +pub struct ExecListener(std::os::fd::OwnedFd); +#[cfg(not(target_os = "linux"))] +pub struct ExecListener; +/// Bind + listen the exec vsock socket (port 4089). Pure socket syscalls, safe +/// to call on the main thread before the container fork. +pub fn bind_exec_server() -> Result> { #[cfg(target_os = "linux")] { - run_vsock_server()?; + use nix::sys::socket::{ + bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, + }; + use std::os::fd::AsRawFd; + + let sock_fd = socket( + AddressFamily::Vsock, + SockType::Stream, + SockFlag::empty(), + None, + )?; + + // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on + // macOS — and so the forked container never inherits the listening socket. + unsafe { + libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + } + + let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT); + bind(sock_fd.as_raw_fd(), &addr)?; + listen(&sock_fd, Backlog::new(4)?)?; + + info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT); + Ok(ExecListener(sock_fd)) } #[cfg(not(target_os = "linux"))] { info!("Exec server not available on non-Linux platform (development mode)"); + Ok(ExecListener) } - - Ok(()) } -/// Linux vsock server implementation. -#[cfg(target_os = "linux")] -fn run_vsock_server() -> Result<(), Box> { - use nix::sys::socket::{ - accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, - }; - use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; - use tracing::error; - - let sock_fd = socket( - AddressFamily::Vsock, - SockType::Stream, - SockFlag::empty(), - None, - )?; +/// Run the exec accept loop on an already-bound listener. Intended to run on its +/// own thread for the VM's lifetime; never returns under normal operation. +pub fn serve_exec_server(listener: ExecListener) -> Result<(), Box> { + #[cfg(target_os = "linux")] + { + run_accept_loop(listener.0) + } - // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS - unsafe { - libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + #[cfg(not(target_os = "linux"))] + { + let _ = listener; + Ok(()) } +} - let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT); - bind(sock_fd.as_raw_fd(), &addr)?; - listen(&sock_fd, Backlog::new(4)?)?; +/// Bind then serve in one call. Kept for callers that don't need the early-bind +/// split (e.g. tests); guest-init's boot path uses `bind_*` + `serve_*` directly. +pub fn run_exec_server() -> Result<(), Box> { + info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT); + serve_exec_server(bind_exec_server()?) +} - info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT); +/// The exec server accept loop. +#[cfg(target_os = "linux")] +fn run_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box> { + use nix::sys::socket::accept; + use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; + use tracing::error; loop { match accept(sock_fd.as_raw_fd()) { diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs index 4a60564..c704daf 100644 --- a/src/guest/init/src/main.rs +++ b/src/guest/init/src/main.rs @@ -43,8 +43,11 @@ impl ExecConfig { /// - BOX_EXEC_ENV_*: container environment variables /// - BOX_EXEC_WORKDIR: working directory (defaults to "/") fn from_env() -> Self { - let executable = - std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/sbin/init".to_string()); + // The runtime always sets BOX_EXEC_EXEC when guest-init is PID 1 + // (runtime/src/vm/spec.rs), so this default is only a defensive fallback. + // Use /bin/sh — universal across distros — never /sbin/init, which does + // not exist on Alpine and was the original cause of issue #3. + let executable = std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/bin/sh".to_string()); // Parse args from individual env vars (BOX_EXEC_ARGC + BOX_EXEC_ARG_0..N) let args: Vec = match std::env::var("BOX_EXEC_ARGC") @@ -268,6 +271,18 @@ fn run_init() -> Result<(), Box> { // Step 2.5: Mount tmpfs volumes mount_tmpfs_volumes()?; + // Step 2.6: Bind the exec (vsock 4089) and PTY (vsock 4090) listening sockets + // NOW, before the slower network bring-up and container spawn below. These are + // pure socket/bind/listen syscalls on this (still single-threaded) main thread, + // so the later container fork stays fork-safe; the accept loops are spawned as + // threads only after the fork (Step 8). Binding this early fills the listen + // backlog from the start of boot, so a host connect QUEUES instead of being + // refused while network setup and the container spawn finish — closing the + // exec/PTY startup race of issue #3. CLOEXEC on the fds keeps the forked + // container from inheriting the listeners. + let exec_listener = exec_server::bind_exec_server()?; + let pty_listener = pty_server::bind_pty_server()?; + // Step 3: Configure guest network (if passt mode is active). // Network setup may write /etc/resolv.conf — must run before read-only remount. network::configure_guest_network()?; @@ -362,9 +377,11 @@ fn run_init() -> Result<(), Box> { expose_container_env_to_exec(&exec_config); - // Step 8: Start exec server in background thread - std::thread::spawn(|| { - if let Err(e) = exec_server::run_exec_server() { + // Step 8: Start the exec server accept loop on the socket bound in Step 2.6. + // (set_container_pid above ran first, so a host signal-main frame still finds + // the PID once the loop is serving.) + std::thread::spawn(move || { + if let Err(e) = exec_server::serve_exec_server(exec_listener) { error!("Exec server failed: {}", e); } }); @@ -376,9 +393,9 @@ fn run_init() -> Result<(), Box> { } }); - // Step 8.5: Start PTY server in background thread - std::thread::spawn(|| { - if let Err(e) = pty_server::run_pty_server() { + // Step 8.5: Start the PTY server accept loop on the socket bound in Step 2.6. + std::thread::spawn(move || { + if let Err(e) = pty_server::serve_pty_server(pty_listener) { error!("PTY server failed: {}", e); } }); diff --git a/src/guest/init/src/pty_server.rs b/src/guest/init/src/pty_server.rs index c50b41f..c457257 100644 --- a/src/guest/init/src/pty_server.rs +++ b/src/guest/init/src/pty_server.rs @@ -15,51 +15,81 @@ use tracing::{error, warn}; #[cfg(target_os = "linux")] use crate::user::parse_process_user; -/// Run the PTY server, listening on vsock port 4090. -/// -/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`. -/// On non-Linux platforms, this is a no-op (development stub). -pub fn run_pty_server() -> Result<(), Box> { - info!("Starting PTY server on vsock port {}", PTY_VSOCK_PORT); +/// A bound, listening PTY-server socket — produced by [`bind_pty_server`] and +/// consumed by [`serve_pty_server`]. Same early-bind rationale as +/// [`crate::exec_server::ExecListener`]: bind on the main thread before the +/// container fork (fork-safe, fills the listen backlog), accept later in a +/// thread. On non-Linux this is an inert placeholder. +#[cfg(target_os = "linux")] +pub struct PtyListener(std::os::fd::OwnedFd); +#[cfg(not(target_os = "linux"))] +pub struct PtyListener; +/// Bind + listen the PTY vsock socket (port 4090). Pure socket syscalls, safe to +/// call on the main thread before the container fork. +pub fn bind_pty_server() -> Result> { #[cfg(target_os = "linux")] { - run_vsock_pty_server()?; + use nix::sys::socket::{ + bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, + }; + use std::os::fd::AsRawFd; + + let sock_fd = socket( + AddressFamily::Vsock, + SockType::Stream, + SockFlag::empty(), + None, + )?; + + // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on + // macOS — and so the forked container never inherits the listening socket. + unsafe { + libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + } + + let addr = VsockAddr::new(libc::VMADDR_CID_ANY, PTY_VSOCK_PORT); + bind(sock_fd.as_raw_fd(), &addr)?; + listen(&sock_fd, Backlog::new(4)?)?; + + info!("PTY server listening on vsock port {}", PTY_VSOCK_PORT); + Ok(PtyListener(sock_fd)) } #[cfg(not(target_os = "linux"))] { info!("PTY server not available on non-Linux platform (development mode)"); + Ok(PtyListener) } - - Ok(()) } -/// Linux vsock PTY server implementation. -#[cfg(target_os = "linux")] -fn run_vsock_pty_server() -> Result<(), Box> { - use nix::sys::socket::{ - accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, - }; - use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; - - let sock_fd = socket( - AddressFamily::Vsock, - SockType::Stream, - SockFlag::empty(), - None, - )?; +/// Run the PTY accept loop on an already-bound listener. Intended to run on its +/// own thread for the VM's lifetime; never returns under normal operation. +pub fn serve_pty_server(listener: PtyListener) -> Result<(), Box> { + #[cfg(target_os = "linux")] + { + run_pty_accept_loop(listener.0) + } - // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS - unsafe { - libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + #[cfg(not(target_os = "linux"))] + { + let _ = listener; + Ok(()) } +} - let addr = VsockAddr::new(libc::VMADDR_CID_ANY, PTY_VSOCK_PORT); - bind(sock_fd.as_raw_fd(), &addr)?; - listen(&sock_fd, Backlog::new(4)?)?; +/// Bind then serve in one call. Kept for callers that don't need the early-bind +/// split; guest-init's boot path uses `bind_*` + `serve_*` directly. +pub fn run_pty_server() -> Result<(), Box> { + info!("Starting PTY server on vsock port {}", PTY_VSOCK_PORT); + serve_pty_server(bind_pty_server()?) +} - info!("PTY server listening on vsock port {}", PTY_VSOCK_PORT); +/// The PTY server accept loop. +#[cfg(target_os = "linux")] +fn run_pty_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box> { + use nix::sys::socket::accept; + use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; loop { match accept(sock_fd.as_raw_fd()) { diff --git a/src/runtime/src/vm/ready.rs b/src/runtime/src/vm/ready.rs index 7cc48ed..8bac187 100644 --- a/src/runtime/src/vm/ready.rs +++ b/src/runtime/src/vm/ready.rs @@ -30,91 +30,80 @@ impl VmManager { Ok(()) } - /// Wait for the exec server socket to become ready. + /// Wait for the exec server to become ready (a Frame Heartbeat round-trip). /// - /// Polls for the socket file to appear, then verifies the exec server - /// is healthy via a Frame Heartbeat round-trip. This is best-effort: - /// if the exec socket never appears (e.g., older guest init without - /// exec server), the VM still boots successfully. + /// Waits for the readiness EVENT — a successful heartbeat — bounded by VM + /// liveness, instead of guessing a fixed timeout. guest-init binds the exec + /// socket early (before the slow network bring-up and container spawn), so the + /// host connect succeeds immediately and the heartbeat passes the moment the + /// guest's accept loop runs — however late in a slow cold boot. Each attempt + /// is individually time-bounded (the early-bound socket makes a host `connect` + /// succeed and then block on read until the guest accepts), the loop returns + /// at once if the VM has exited (a fast-exiting container never stalls), and a + /// large absolute cap is only a last-resort backstop against a wedged-but-alive + /// guest — not the expected wait. Best-effort: exec/attach also connect on + /// demand, so even a timed-out probe does not mean exec is unavailable. #[cfg(unix)] pub(crate) async fn wait_for_exec_ready( &mut self, exec_socket_path: &std::path::Path, ) -> Result<()> { - const MAX_WAIT_MS: u64 = 10000; - const POLL_INTERVAL_MS: u64 = 200; + use tokio::time::Duration; + + // Per-attempt cap on one connect + heartbeat round-trip. guest-init binds + // the exec socket early, so the host `connect` succeeds as soon as the VM + // boots and `heartbeat()`'s read then blocks until the guest's accept loop + // runs; bounding each attempt keeps the loop checking VM liveness instead + // of hanging in that read. + const ATTEMPT_TIMEOUT: Duration = Duration::from_millis(500); + const POLL_INTERVAL: Duration = Duration::from_millis(200); + // Last-resort backstop against a wedged-but-alive guest that binds but + // never accepts. A healthy guest passes the heartbeat the instant its + // accept loop runs (however late), and an exited VM returns immediately + // below — so this cap is not the expected wait. + const MAX_WAIT_MS: u64 = 120_000; tracing::debug!( socket_path = %exec_socket_path.display(), - "Waiting for exec server socket" + "Waiting for exec server readiness" ); let start = std::time::Instant::now(); - // Phase 1: Wait for socket file to appear loop { - if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 { - tracing::warn!("Exec socket did not appear, exec will not be available"); - return Ok(()); - } - - if exec_socket_path.exists() { - tracing::debug!("Exec socket file detected"); - break; - } - - // Check if VM is still running (has_exited treats a zombie shim as - // exited; is_running's kill(pid,0) would not). + // Return at once if the VM has already exited (zombie-aware: has_exited + // treats a zombie shim as exited, unlike is_running's kill(pid,0)). A + // fast-exiting container never stalls here. if let Some(ref handler) = *self.handler.read().await { if handler.has_exited() { - tracing::warn!("VM exited before exec socket appeared"); + tracing::debug!("VM exited before exec server became ready"); return Ok(()); } } - tokio::time::sleep(tokio::time::Duration::from_millis(POLL_INTERVAL_MS)).await; - } - - // Phase 2: Connect and verify with Heartbeat health check - while start.elapsed().as_millis() < MAX_WAIT_MS as u128 { - // Stop waiting if the VM has already exited: the exec socket can - // appear during guest init and then vanish when a fast-exiting - // container shuts the VM down. The shim becomes a zombie the moment - // the VM halts, so use has_exited (zombie-aware) rather than - // is_running — without this, a container that exits before its first - // heartbeat stalls the whole boot for MAX_WAIT_MS (~10s), which hit - // every short-lived `run` that lost the heartbeat race and every - // monitor restart of a fast-exiting container. - if let Some(ref handler) = *self.handler.read().await { - if handler.has_exited() { - tracing::debug!("VM exited before exec server became ready"); + // One bounded connect + heartbeat attempt. A timeout (early-bound + // socket, guest not yet accepting) or any error just means "retry". + if let Ok(Ok(client)) = + tokio::time::timeout(ATTEMPT_TIMEOUT, ExecClient::connect(exec_socket_path)).await + { + if let Ok(Ok(true)) = + tokio::time::timeout(ATTEMPT_TIMEOUT, client.heartbeat()).await + { + tracing::debug!("Exec server heartbeat passed"); + self.exec_client = Some(client); return Ok(()); } } - match ExecClient::connect(exec_socket_path).await { - Ok(client) => match client.heartbeat().await { - Ok(true) => { - tracing::debug!("Exec server heartbeat passed"); - self.exec_client = Some(client); - return Ok(()); - } - Ok(false) => { - tracing::debug!("Exec server heartbeat failed, retrying"); - } - Err(e) => { - tracing::debug!(error = %e, "Exec heartbeat error, retrying"); - } - }, - Err(e) => { - tracing::debug!(error = %e, "Exec connect failed, retrying"); - } + if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 { + tracing::warn!( + timeout_ms = MAX_WAIT_MS, + "Exec server did not become ready within the safety cap; exec/attach connect on demand and may still succeed once the guest finishes starting" + ); + return Ok(()); } - tokio::time::sleep(tokio::time::Duration::from_millis(POLL_INTERVAL_MS)).await; + tokio::time::sleep(POLL_INTERVAL).await; } - - tracing::warn!("Exec socket appeared but heartbeat failed, exec will not be available"); - Ok(()) } } diff --git a/src/runtime/src/vm/spec.rs b/src/runtime/src/vm/spec.rs index 7b66429..f38bca8 100644 --- a/src/runtime/src/vm/spec.rs +++ b/src/runtime/src/vm/spec.rs @@ -342,7 +342,8 @@ impl VmManager { /// - If `entrypoint_override` is set, it replaces the OCI ENTRYPOINT /// - If ENTRYPOINT is set: executable = ENTRYPOINT[0], args = ENTRYPOINT[1:] + CMD /// - If only CMD is set: executable = CMD[0], args = CMD[1:] - /// - If neither: fall back to `/sbin/init` + /// - If neither: fall back to `/bin/sh` (universal across distros; `/sbin/init` + /// does not exist on Alpine, which was the original cause of issue #3) /// - If `cmd_override` is non-empty, it replaces the OCI CMD /// /// Paths are used as-is since the OCI image is always extracted at rootfs root. From e8b18241a6db06b98a48ef93554de944b6bf9ada Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 12:09:32 +0800 Subject: [PATCH 2/3] =?UTF-8?q?fix(init):=20real=20PID1=20reaper=20?= =?UTF-8?q?=E2=80=94=20reap=20orphans=20without=20stealing=20exec/PTY=20ex?= =?UTF-8?q?it=20codes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit guest-init runs as PID 1 but only waited on the container pid, so reparented grandchildren and the sidecar were never reaped and accumulated as zombies for the VM's lifetime. The earlier code couldn't just waitpid(-1): that races with the exec/PTY handlers, which waitpid their own children to read the real exit code — a stolen child makes the handler see ECHILD and report a bogus exit 0 (exec_server.rs). That tension is exactly why a prior fix narrowed the loop to waitpid(container_pid), trading the zombie leak for correct exec codes. Resolve both with a small reaper registry: - New `reaper` module: handlers mark their child pid MANAGED across the spawn (the lock is held across fork, closing the spawn/register race for fast-exiting commands like `exec -- false`); an RAII guard unregisters on every return path. - The supervision loop now peeks exited children non-destructively with `waitid(WNOWAIT)` and routes: the container -> reap + propagate exit code (VM lifecycle, unchanged); MANAGED children -> left for their handler to reap (real exit codes preserved); everything else (orphans + sidecar) -> reaped here. - exec one-shot + streaming spawns and the PTY fork register their children; their existing waitpid/try_wait paths are unchanged. Fixes the zombie leak and makes the long-standing "reaped by the zombie-reaper loop" comments true again, with no regression to exec/PTY or container exit codes. Unit-tested (reaper registry); needs KVM verification of exec exit codes + orphan reaping. Builds on P1+P2 (issue #3). --- src/guest/init/src/exec_server.rs | 14 +++- src/guest/init/src/lib.rs | 1 + src/guest/init/src/main.rs | 99 +++++++++++++++++++------ src/guest/init/src/pty_server.rs | 5 ++ src/guest/init/src/reaper.rs | 119 ++++++++++++++++++++++++++++++ 5 files changed, 211 insertions(+), 27 deletions(-) create mode 100644 src/guest/init/src/reaper.rs diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs index eb7da21..5e9f4c3 100644 --- a/src/guest/init/src/exec_server.rs +++ b/src/guest/init/src/exec_server.rs @@ -751,8 +751,12 @@ fn execute_command( Err(output) => return output, }; - let mut child = match command.spawn() { - Ok(child) => child, + // Spawn under the reaper registry: the pid is marked MANAGED before the PID 1 + // supervision loop can see it, so the loop leaves this child for us to reap + // (and read its real exit code) instead of stealing it. The guard unregisters + // the pid when this function returns (all paths). + let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) { + Ok(pair) => pair, Err(e) => { return ExecOutput { stdout: vec![], @@ -1043,8 +1047,10 @@ fn execute_command_streaming( } }; - let mut child = match command.spawn() { - Ok(child) => child, + // Spawn under the reaper registry (see one-shot path) so PID 1 leaves this + // streaming child for us to reap; the guard unregisters on return. + let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) { + Ok(pair) => pair, Err(e) => { let output = ExecOutput { stdout: vec![], diff --git a/src/guest/init/src/lib.rs b/src/guest/init/src/lib.rs index 57b05ef..f6a6f34 100644 --- a/src/guest/init/src/lib.rs +++ b/src/guest/init/src/lib.rs @@ -14,6 +14,7 @@ pub mod namespace; pub mod network; pub mod port_forward; pub mod pty_server; +pub mod reaper; pub mod user; pub use namespace::{spawn_isolated, NamespaceConfig, NamespaceError}; diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs index c704daf..c862cb2 100644 --- a/src/guest/init/src/main.rs +++ b/src/guest/init/src/main.rs @@ -986,55 +986,105 @@ fn remount_rootfs_readonly() -> Result<(), Box> { Ok(()) } -/// Wait for the main container process. +/// Supervise children as PID 1: propagate the container's exit, and reap orphans. /// -/// Exec and PTY requests run in other guest-init threads and wait for their -/// own child processes. The main supervision loop must not call waitpid(-1), -/// otherwise it can reap those children before the request handler observes -/// their exit status. +/// Exec and PTY request handlers reap their OWN children (each `waitpid`s a +/// specific pid) to read the real exit status, so this loop must not steal them +/// with a blind `waitpid(-1)`. It peeks exited children non-destructively with +/// `waitid(WNOWAIT)` and, via the [`reaper`](a3s_box_guest_init::reaper) +/// registry, reaps only the container (→ VM lifecycle / exit code) and UNMANAGED +/// children — reparented grandchildren and the sidecar — leaving handler-managed +/// children for their handler. This propagates the container exit code AND fixes +/// the zombie leak (orphans were previously never reaped until shutdown). +#[cfg(target_os = "linux")] fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box> { - use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus}; + use a3s_box_guest_init::reaper; + use nix::sys::wait::{waitid, waitpid, Id, WaitPidFlag, WaitStatus}; /// Maximum time to wait for children after forwarding SIGTERM (5 seconds). const CHILD_SHUTDOWN_TIMEOUT_MS: u64 = 5000; - info!("Waiting for container process {}", container_pid); + info!( + "Supervising children as PID 1; container PID {}", + container_pid + ); loop { - // Check if shutdown was requested via SIGTERM if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) { info!("SIGTERM received, initiating graceful shutdown"); graceful_shutdown(CHILD_SHUTDOWN_TIMEOUT_MS); return Ok(()); } + // Drain currently-exited children. `WNOWAIT` peeks without reaping, so a + // handler-managed child stays reapable by its handler; we break on it and + // revisit next tick (the handler clears it within its own poll interval). + loop { + let (pid, code, signaled) = match waitid( + Id::All, + WaitPidFlag::WEXITED | WaitPidFlag::WNOWAIT | WaitPidFlag::WNOHANG, + ) { + Ok(WaitStatus::Exited(pid, status)) => (pid, status, false), + Ok(WaitStatus::Signaled(pid, signal, _)) => (pid, 128 + signal as i32, true), + // No exited child right now: stop draining and poll again later. + Ok(_) => break, + // No children at all (container already gone): nothing to supervise. + Err(nix::errno::Errno::ECHILD) => return Ok(()), + // Transient error: retry on the next tick. + Err(_) => break, + }; + + if pid == container_pid { + // The container drives the VM lifecycle: reap it and exit with its + // status so the host (and detached `run -d wait`) sees the real code. + let _ = waitpid(pid, None); + if signaled { + error!("Container process {} terminated (exit code {})", pid, code); + } else { + info!("Container process {} exited with status {}", pid, code); + } + persist_exit_code(code); + process::exit(code); + } else if reaper::is_managed(pid.as_raw()) { + // Owned by an exec/PTY handler, which reaps it for the real status. + // Stop draining; it clears shortly and we revisit on the next tick. + break; + } else { + // Orphan (reparented grandchild) or the sidecar: reap it here so it + // does not linger as a zombie. Keep draining for more. + let _ = waitpid(pid, Some(WaitPidFlag::WNOHANG)); + } + } + + std::thread::sleep(std::time::Duration::from_millis(100)); + } +} + +/// Non-Linux development stub: just wait for the container process to exit. +#[cfg(not(target_os = "linux"))] +fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box> { + use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus}; + + loop { + if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) { + return Ok(()); + } match waitpid(container_pid, Some(WaitPidFlag::WNOHANG)) { - Ok(WaitStatus::Exited(pid, status)) => { - info!("Container process {} exited with status {}", pid, status); + Ok(WaitStatus::Exited(_, status)) => { persist_exit_code(status); process::exit(status); } - Ok(WaitStatus::Signaled(pid, signal, _)) => { - error!("Container process {} killed by signal {:?}", pid, signal); + Ok(WaitStatus::Signaled(_, signal, _)) => { persist_exit_code(128 + signal as i32); process::exit(128 + signal as i32); } Ok(WaitStatus::StillAlive) => { std::thread::sleep(std::time::Duration::from_millis(100)); } - Ok(_) => { - // Other status, continue waiting - } - Err(nix::errno::Errno::ECHILD) => { - info!("Container process {} is no longer a child", container_pid); - break; - } - Err(e) => { - return Err(format!("waitpid failed: {}", e).into()); - } + Ok(_) => {} + Err(_) => break, } } - Ok(()) } @@ -1053,6 +1103,9 @@ fn persist_exit_code(code: i32) { } /// Perform graceful shutdown: forward SIGTERM to children, wait, then force-kill. +/// Only the Linux supervision loop drives this (the non-Linux dev stub exits the +/// process directly), so it is gated to avoid a dead-code warning on macOS. +#[cfg(target_os = "linux")] fn graceful_shutdown(timeout_ms: u64) { // Step 1: Send SIGTERM to all processes (except ourselves, PID 1) #[cfg(target_os = "linux")] diff --git a/src/guest/init/src/pty_server.rs b/src/guest/init/src/pty_server.rs index c457257..b1648f4 100644 --- a/src/guest/init/src/pty_server.rs +++ b/src/guest/init/src/pty_server.rs @@ -246,6 +246,11 @@ fn handle_pty_connection(fd: std::os::fd::OwnedFd) -> Result<(), Box> = Mutex::new(Vec::new()); + +/// RAII guard: removes its pid from the MANAGED set when dropped (i.e. when the +/// handler returns, having reaped its own child). Drop runs on every handler +/// exit path — normal, timeout-kill, and error — so the set never leaks. +pub struct ManagedChild(i32); + +impl Drop for ManagedChild { + fn drop(&mut self) { + // Recover from a poisoned lock rather than panicking inside PID 1. + let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner()); + managed.retain(|&p| p != self.0); + } +} + +/// Spawn a child while atomically marking its pid MANAGED. +/// +/// The MANAGED lock is held across the `spawn` call, so the supervision loop +/// cannot conclude the new pid is unmanaged (and reap it as an orphan) in the +/// window between fork and registration — which closes the race for commands +/// that exit almost immediately (e.g. `exec … -- false`). Holding the lock +/// across the fork is safe: the forked child runs only async-signal-safe code +/// before `exec` and never touches this mutex, so its inherited (locked) copy is +/// discarded at `exec` without deadlock. +pub fn spawn_managed(spawn: F) -> std::io::Result<(std::process::Child, ManagedChild)> +where + F: FnOnce() -> std::io::Result, +{ + let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner()); + let child = spawn()?; + let pid = child.id() as i32; + managed.push(pid); + drop(managed); + Ok((child, ManagedChild(pid))) +} + +/// Mark an already-forked pid MANAGED (raw-fork callers, e.g. the PTY server). +/// Call immediately after `fork` in the parent. Returns the RAII guard. +pub fn manage_pid(pid: i32) -> ManagedChild { + let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner()); + if !managed.contains(&pid) { + managed.push(pid); + } + drop(managed); + ManagedChild(pid) +} + +/// Whether `pid` is owned by a request handler (the loop must not reap it). +pub fn is_managed(pid: i32) -> bool { + MANAGED + .lock() + .map(|m| m.contains(&pid)) + .unwrap_or_else(|e| e.into_inner().contains(&pid)) +} + +#[cfg(test)] +mod tests { + use super::*; + + // Distinct synthetic pids per test so the shared MANAGED static doesn't + // collide across parallel test threads. + #[test] + fn manage_pid_registers_until_guard_dropped() { + let pid = 0x7fff_fff0; + assert!(!is_managed(pid)); + { + let _guard = manage_pid(pid); + assert!( + is_managed(pid), + "pid should be managed while the guard lives" + ); + } + assert!(!is_managed(pid), "guard drop must unregister the pid"); + } + + #[test] + fn unregistered_pid_is_not_managed() { + assert!(!is_managed(0x7fff_fff1)); + } + + #[test] + fn spawn_managed_marks_then_releases_real_child() { + // A child that exits immediately is the worst case the lock-across-spawn + // protects: it must still be MANAGED for the handler (here, the test) to + // reap, never the supervision loop. + let (mut child, guard) = + spawn_managed(|| std::process::Command::new("true").spawn()).expect("spawn true"); + let pid = child.id() as i32; + assert!( + is_managed(pid), + "spawned pid must be registered before we observe it" + ); + let _ = child.wait(); // the handler (test) reaps its own child + drop(guard); + assert!(!is_managed(pid), "guard drop must unregister after reap"); + } +} From 719040019f0034cec7b1e6cf358cb3eba29ec982 Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 16:03:19 +0800 Subject: [PATCH 3/3] docs: P2 deferred-main-spawn design (GO-WITH-CONDITIONS) Adversarial mapping of the #15+#18 base resolved both crux uncertainties: console logs come free via process-wide fd inheritance (Stdio::inherit, not the exec path's piped), and the multi-threaded fork hazard is avoided by spawning the deferred main via Command::spawn (not spawn_isolated's raw fork; the VM already isolates). Conditions: single spawn-main (CAS) + atomic late container-pid handoff to the reaper. Includes risk-ranked blockers + a 7-phase plan whose Phase 0 is a single KVM prototype that de-risks the whole feature. --- docs/p2-deferred-main-spawn-design.md | 118 ++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 docs/p2-deferred-main-spawn-design.md diff --git a/docs/p2-deferred-main-spawn-design.md b/docs/p2-deferred-main-spawn-design.md new file mode 100644 index 0000000..a763ba0 --- /dev/null +++ b/docs/p2-deferred-main-spawn-design.md @@ -0,0 +1,118 @@ +# Design: P2 — Deferred-Main-Spawn (full box semantics for pooled sandboxes) + +Status: **GO-WITH-CONDITIONS** (design + prototype-first). Builds on +`refactor/init-readiness` (PR #15: early-bind + event-driven readiness + PID1 +reaper) and `feat/p1-template-pool` (PR #18: the warm-sandbox pool controller). +Derived from an adversarial mapping of the real #15+#18 base. + +## 1. Goal + +The pool MVP (PR #18) runs a command in a warm VM via the **exec stream**, so its +output comes back over the exec protocol, **not** the json-file `logs`. P2 gives a +pooled sandbox **full `box` semantics** — the command becomes the VM's real +**container main**, so its exit code flows through the normal `/upper/.a3s_exit_code` +path and its stdout/stderr land in `/logs/container.json` exactly like a +normal `box run`. The VM still skips cold boot (it was pre-warmed), but now behaves +like a first-class box. + +## 2. Verdict & the two crux realizations + +**GO-WITH-CONDITIONS** — both hard problems are tractable on the #15 base: + +1. **Console logs are "free" via process-wide fd inheritance.** The shim wires the + libkrun split console at boot (`shim/main.rs` `add_split_console`, fds kept alive + via `mem::forget`); inside the guest, PID 1 holds fds 1/2 = `/dev/console`. + guest-init routes *its own* logs to `/dev/kmsg` to keep the console clean. The + boot main reaches `container.json` today **only** because `namespace::spawn_isolated` + leaves stdout/stderr at the default `Stdio::inherit`. So a deferred main spawned + with `Stdio::inherit` inherits PID 1's console fds and its output flows to + `console.log`/`console.err.log` → the log processor tags it into `container.json`. + **No fd-stashing, dup-to-100, or env-passing of fd numbers is required** — there + is one shared fd table. (The exec path's `Stdio::piped()` is exactly why exec + output does *not* reach the logs today.) + +2. **The multi-threaded fork hazard is avoidable.** Do **not** spawn the deferred + main via `spawn_isolated`'s raw `fork()` — its child runs heavy allocating code + (tracing, `fs::metadata`, user resolution) before exec and can deadlock on an + allocator/tracing lock held by another thread of the (deeply multi-threaded) + PID 1. Instead spawn via `std::process::Command::spawn()` — the same clone/exec + primitive the exec server already uses safely — whose child runs **only** the + registered async-signal-safe `pre_exec` hook before `execvp`. The VM itself + provides isolation, so the deferred main needs **no** `unshare()`/PID-namespace, + removing the second fork entirely. + +**Conditions:** exactly one spawn-main (CAS on the container-pid sentinel); the +late container-pid is handed to the reaper atomically (register MANAGED → publish +→ drop guard) so it can't be reaped as an orphan (the issue-#3 class of bug). + +## 3. Design (end-to-end) + +Deferred-main is a **replacement** for the keepalive (it IS the VM's main), not a +companion. + +- **Boot IDLE** — `BOX_DEFERRED_MAIN=1` makes `run_init` skip `spawn_isolated`; + the container-pid stays the sentinel `-1` and PID 1 stays alive (the + `wait_for_children` loop already loops). The early-bind (Step 2.6) and accept-loop + (Step 8) are **unchanged**, so host readiness passes IDLE: the heartbeat handler + is a pure protocol handshake with **no** container-pid dependency — so + `BoxState::Ready` already means "exec server live", which is the de-facto contract + today. +- **spawn-main control frame** — the host sends `spawn-main:` on + the exec vsock. The handler funnels the request through a **safe** spawn path: + `std::process::Command` + `Command::spawn()` under `reaper::spawn_managed`, with a + `build_command` variant that leaves stdout/stderr at `Stdio::inherit()` (so the + child inherits PID 1's console fds). Security (seccomp, user resolution, binary + stat) is built in the **parent** before spawn, mirroring `apply_security_before_exec`. +- **Reaper / exit-code handoff** — make the supervision loop's `container_pid` an + `AtomicI32` read each tick. Order is the crux: `spawn_managed` (lock-held-across- + spawn closes the fork/registration race) → `set_container_pid(pid)` **while still + MANAGED** → drop the guard. The `is_managed` branch covers the pre-publish window; + the `pid == container_pid` branch then reaps it, persists `/.a3s_exit_code` + (overlay upper), and `process::exit(code)` halts the VM. The handler replies + `spawn-main-ack` only **after** a successful spawn+publish (so a fork failure is + reported, not lost). +- **Pool integration (#18)** — add `Request::SpawnMain` to `pool.rs`; the daemon + sends spawn-main instead of `vm.exec_command`, waits for VM exit (the existing + teardown owns lifecycle), and reads exit code from `/upper/.a3s_exit_code` + and logs from `/logs/container.json`. `Request::Run` stays for back-compat. + +## 4. Risk-ranked blockers (with mitigations) + +| sev | blocker | mitigation | +|-----|---------|-----------| +| HIGH | multi-threaded fork deadlock (raw `fork()` + allocating child) | spawn via `Command::spawn()`, not `spawn_isolated`; build seccomp/user/stat in the parent; no `unshare()` (VM isolates) | +| HIGH | late container-pid race → reaped as orphan, exit code lost (issue-#3 class) | `AtomicI32` container-pid read each tick; `spawn_managed` → publish-while-MANAGED → drop guard; guest unit test: spawn-main an immediate-exit cmd, assert real code not 0 | +| HIGH | console logs broken if deferred main reuses the exec path's `Stdio::piped()` | `build_command` variant with `Stdio::inherit()` stdout/stderr; integration test: spawn-main `echo`, assert the line appears stream-tagged in `container.json` | +| MED | readiness-contract drift (Ready = "no container yet") reopens connection-refused races | scope P2 to the **pool path only** (daemon explicitly drives spawn-main then waits); leave normal `box run` on eager boot-spawn; IDLE-Ready is pool-internal | +| MED | multiple spawn-main frames → two mains race to set container-pid / write exit code | CAS container-pid `-1 → pending → pid`; a second concurrent spawn-main gets "main already spawned" | +| LOW | PTY-server `pre_exec` uses `set_var` (not async-signal-safe) | base the deferred spawner on the **exec** path (`Command::spawn`), never the PTY raw-fork path | + +## 5. Phased plan (smallest verifiable steps) + +0. **PROTOTYPE (throwaway, KVM)** — from inside an exec-server connection thread + (i.e. multi-threaded PID 1), `Command::spawn` `/bin/sh -c 'echo OUT; echo ERR 1>&2; + exit 7'` with `Stdio::inherit`. Verify **at once**: (a) no fork deadlock under + concurrent exec load, (b) OUT/ERR land in `container.json` correctly stream-tagged, + (c) exit 7 propagates via `/.a3s_exit_code`. This one prototype de-risks the whole + feature. **Must run on real KVM** (fork/allocator-lock + virtio-console don't + reproduce on the macOS dev stub). +1. IDLE boot behind `BOX_DEFERRED_MAIN=1`; assert host readiness still reaches Ready + with no container. +2. Convert supervision-loop `container_pid` to `AtomicI32` (set once at boot as + today — no behavior change); regression-test the eager boot main still reaps right. +3. Safe deferred spawner: `Stdio::inherit` `build_command` variant under + `spawn_managed`; CAS sentinel→pending→pid; publish-before-drop; ack on success. +4. Host spawn-main control frame: `send_spawn_main_control_frame` + + `EXEC_CONTROL_SPAWN_MAIN`; `wait_main_exit` (poll `/.a3s_exit_code`) + + `collect_logs` (read `container.json`). +5. Pool integration: `Request::SpawnMain` in `pool.rs`; full e2e — pool spawn-main a + real image entrypoint, assert exit code + json-file logs + clean teardown. +6. Full KVM matrix: issue-#3 before/after readiness, fast-exit (`false`) exit-code, + large-output log flushing, concurrent spawn-main rejection. Docs per repo rule. + +## 6. Prototype-first + +The single experiment that de-risks everything: a multi-threaded spawn-main that +simultaneously proves **(a)** safe fork via `Command::spawn()` under concurrent exec +load (no deadlock), **(b)** inherited-stdio output → `container.json` correctly +stream-tagged, **(c)** real exit code in `/.a3s_exit_code`. On real KVM only.