From 328bb01a931c615924ab7c7390f7ca415aac36fc Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 09:53:59 +0800 Subject: [PATCH] fix(runtime): tolerate slow guest exec-server bind on boot (issue #3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a cold first run on a slow/loaded host, guest-init binds its exec (vsock 4089) and PTY (vsock 4090) servers only late in boot — after the virtio-fs pivot, network bring-up, and the container spawn. It cannot start them earlier: spawn_isolated forks the container and runs non-async-signal-safe code (tracing/alloc) before exec, which is only safe because the parent is single-threaded at fork time; starting the server threads first would risk a malloc/log-lock deadlock in the child. That late bind could land past the host's fixed 10s exec-readiness budget, producing the false "Exec socket appeared but heartbeat failed, exec will not be available" warning. For `run -it` the same slow bind made the PTY attach race the server ("Connection refused"). Raise the wait_for_exec_ready budget to 30s. It stays cheap for healthy boxes (returns the moment the heartbeat passes) and already bails out the instant the VM exits, so a fast-exiting container never stalls for the full budget. Because boot blocks on this exec wait before the `run -it` PTY attach, and the guest brings exec+PTY up back-to-back, the existing 10s PTY connect retry then succeeds — no PTY-budget change needed. Also: reword the two readiness warnings (exec/attach connect on demand, so a timed-out probe no longer claims exec is unavailable); fix the dead `/sbin/init` BOX_EXEC_EXEC default to `/bin/sh` (the runtime always sets the var, and /sbin/init is absent on Alpine — the original #3 symptom); and correct a stale resolve_oci_entrypoint doc comment. Scope: addresses the timing race on a healthy guest. A hard boot failure where the guest never binds (e.g. 
bridge-mode eth0 setup failing) is a separate fault and surfaces after the wait rather than being masked. --- CHANGELOG.md | 25 +++++++++++++++++++++++++ src/guest/init/src/main.rs | 7 +++++-- src/runtime/src/vm/ready.rs | 28 ++++++++++++++++++++++------ src/runtime/src/vm/spec.rs | 3 ++- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e1c628..8d27bb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,31 @@ All notable changes to A3S Box will be documented in this file. ## [Unreleased] +### Fixed +- **Slow-boot exec/PTY readiness race (`WARN Exec socket appeared but heartbeat + failed`, and `run -it` PTY `Connection refused`) — issue #3.** The guest binds + its exec (vsock 4089) and PTY (vsock 4090) servers only late in boot — after + the virtio-fs pivot, network bring-up, and the container spawn — and it cannot + start them earlier without forking the container while multi-threaded (which + would risk a deadlock in the forked child). On a cold first run on a slow or + loaded host that bind could land past the host's fixed **10 s** readiness + budget, producing a false "heartbeat failed" warning. The host now waits up to + **30 s** for the exec heartbeat in `wait_for_exec_ready`. This also fixes + `run -it`: boot blocks on that exec-readiness wait *before* attaching the PTY, + and the guest brings the exec and PTY servers up back-to-back, so once the exec + heartbeat passes the PTY server is already listening and the existing 10 s PTY + connect retry succeeds. The wait stays cheap for healthy boxes (it returns the + moment the heartbeat passes) and still bails out immediately when the VM exits, + so a fast-exiting container never stalls for the full budget. The two readiness + warnings were also corrected — exec/attach connect on demand, so a timed-out + probe no longer claims "exec will not be available". 
Note: this addresses the + *timing* race on an otherwise-healthy guest; a hard boot failure where the + guest never binds the server (e.g. `--network` bridge mode when guest eth0 + setup fails) is a separate fault and will surface after the wait rather than be + masked. +- Guest-init's defensive `BOX_EXEC_EXEC` default is now `/bin/sh` instead of the + non-existent-on-Alpine `/sbin/init`, matching the runtime's real fallback. + ## [2.0.7] — 2026-06-06 ### Added diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs index 4a60564..edca61e 100644 --- a/src/guest/init/src/main.rs +++ b/src/guest/init/src/main.rs @@ -43,8 +43,11 @@ impl ExecConfig { /// - BOX_EXEC_ENV_*: container environment variables /// - BOX_EXEC_WORKDIR: working directory (defaults to "/") fn from_env() -> Self { - let executable = - std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/sbin/init".to_string()); + // The runtime always sets BOX_EXEC_EXEC when guest-init is PID 1 + // (runtime/src/vm/spec.rs), so this default is only a defensive fallback. + // Use /bin/sh — universal across distros — never /sbin/init, which does + // not exist on Alpine and was the original cause of issue #3. + let executable = std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/bin/sh".to_string()); // Parse args from individual env vars (BOX_EXEC_ARGC + BOX_EXEC_ARG_0..N) let args: Vec<String> = match std::env::var("BOX_EXEC_ARGC") diff --git a/src/runtime/src/vm/ready.rs b/src/runtime/src/vm/ready.rs index 7cc48ed..eeb42f9 100644 --- a/src/runtime/src/vm/ready.rs +++ b/src/runtime/src/vm/ready.rs @@ -41,7 +41,17 @@ impl VmManager { &mut self, exec_socket_path: &std::path::Path, ) -> Result<()> { - const MAX_WAIT_MS: u64 = 10000; + // The guest binds the exec server only late in boot (after virtio-fs + // pivot, passt network bring-up, and the container spawn — guest-init + // cannot start it earlier without forking the container while + // multi-threaded, which is unsafe). 
A cold first run on a slow/loaded + // host can push that past the old 10s budget, which surfaced as a false + // "heartbeat failed" warning and, for `run -it`, a PTY connect that gave + // up before the server came up (issue #3). Wait longer — this is cheap + // for healthy boxes (they return as soon as the heartbeat passes) and the + // loop already bails out the moment the VM exits, so a fast-exiting + // container never stalls for the full budget. + const MAX_WAIT_MS: u64 = 30000; const POLL_INTERVAL_MS: u64 = 200; tracing::debug!( @@ -54,7 +64,10 @@ impl VmManager { // Phase 1: Wait for socket file to appear loop { if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 { - tracing::warn!("Exec socket did not appear, exec will not be available"); + tracing::warn!( + timeout_ms = MAX_WAIT_MS, + "Exec socket did not appear within timeout; exec/attach will connect on demand if the guest exposes it" + ); return Ok(()); } @@ -82,9 +95,9 @@ impl VmManager { // container shuts the VM down. The shim becomes a zombie the moment // the VM halts, so use has_exited (zombie-aware) rather than // is_running — without this, a container that exits before its first - // heartbeat stalls the whole boot for MAX_WAIT_MS (~10s), which hit - // every short-lived `run` that lost the heartbeat race and every - // monitor restart of a fast-exiting container. + // heartbeat stalls the whole boot for the full MAX_WAIT_MS budget, + // which hit every short-lived `run` that lost the heartbeat race and + // every monitor restart of a fast-exiting container. 
if let Some(ref handler) = *self.handler.read().await { if handler.has_exited() { tracing::debug!("VM exited before exec server became ready"); @@ -114,7 +127,10 @@ impl VmManager { tokio::time::sleep(tokio::time::Duration::from_millis(POLL_INTERVAL_MS)).await; } - tracing::warn!("Exec socket appeared but heartbeat failed, exec will not be available"); + tracing::warn!( + timeout_ms = MAX_WAIT_MS, + "Exec server did not pass a heartbeat within timeout; exec/attach connect on demand and may still succeed once the guest finishes starting" + ); Ok(()) } } diff --git a/src/runtime/src/vm/spec.rs b/src/runtime/src/vm/spec.rs index 7b66429..f38bca8 100644 --- a/src/runtime/src/vm/spec.rs +++ b/src/runtime/src/vm/spec.rs @@ -342,7 +342,8 @@ impl VmManager { /// - If `entrypoint_override` is set, it replaces the OCI ENTRYPOINT /// - If ENTRYPOINT is set: executable = ENTRYPOINT[0], args = ENTRYPOINT[1:] + CMD /// - If only CMD is set: executable = CMD[0], args = CMD[1:] - /// - If neither: fall back to `/sbin/init` + /// - If neither: fall back to `/bin/sh` (universal across distros; `/sbin/init` + /// does not exist on Alpine, which was the original cause of issue #3) /// - If `cmd_override` is non-empty, it replaces the OCI CMD /// /// Paths are used as-is since the OCI image is always extracted at rootfs root.