From 7df23884681601c741355ff833d121ae5feed413 Mon Sep 17 00:00:00 2001
From: Roy Lin
Date: Thu, 11 Jun 2026 12:41:36 +0800
Subject: [PATCH 1/3] feat(shim): opt-in KSM page-merging for guest memory (A3S_BOX_KSM)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mark the shim's anonymous memory — including libkrun's guest RAM —
KSM-mergeable via prctl(PR_SET_MEMORY_MERGE) (Linux 6.4+) when
A3S_BOX_KSM=1. With KSM enabled on the host, identical pages across
same-image microVMs (kernel text, common runtime/libs) are deduplicated by
ksmd, so N warm VMs of one image cost far less host RAM than N× their size —
the memory-density half of a CoW-fork model, without any libkrun change.
Best-effort: no-op when unset or on pre-6.4 kernels.

Tier 2 of the CoW optimization plan (libkrun has no VM snapshot/fork; KSM is
the VMM-agnostic, host-side path to clone-style '100 VMs ~ 10' memory
density).
---
 src/shim/src/main.rs | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/shim/src/main.rs b/src/shim/src/main.rs
index 3f2f087..23a6977 100644
--- a/src/shim/src/main.rs
+++ b/src/shim/src/main.rs
@@ -75,6 +75,50 @@ fn main() {
     }
 }
 
+/// Opt-in (env `A3S_BOX_KSM=1`): mark this shim's anonymous memory — including
+/// libkrun's guest RAM, which `start_enter` allocates as anonymous `mmap` after
+/// this runs — as KSM-mergeable via `prctl(PR_SET_MEMORY_MERGE)` (Linux 6.4+).
+/// With KSM enabled on the host (`/sys/kernel/mm/ksm/run = 1`), identical pages
+/// across same-image microVMs (kernel text, common runtime/libs) are deduplicated
+/// by ksmd, so N warm VMs of one image cost far less host RAM than N× their size.
+/// Accepted values are `1`, `true`, `yes` and `on` (exact match). Best-effort:
+/// does nothing when the env is unset, and only logs a warning when the kernel
+/// rejects the call (e.g. pre-6.4 → EINVAL).
+#[cfg(target_os = "linux")]
+fn maybe_enable_ksm_merge() {
+    // PR_SET_MEMORY_MERGE (since Linux 6.4) — not in all libc versions, so use
+    // the numeric value directly.
+    const PR_SET_MEMORY_MERGE: libc::c_int = 67;
+    // prctl(2) reads each variadic argument as `unsigned long`. Untyped integer
+    // literals would be passed as 32-bit `c_int`, and this option is rejected
+    // with EINVAL unless arg3..arg5 are exactly zero — so give them the full width.
+    const ENABLE: libc::c_ulong = 1;
+    const UNUSED: libc::c_ulong = 0;
+
+    let enabled = std::env::var("A3S_BOX_KSM")
+        .map(|v| matches!(v.as_str(), "1" | "true" | "yes" | "on"))
+        .unwrap_or(false);
+    if !enabled {
+        return;
+    }
+
+    // SAFETY: PR_SET_MEMORY_MERGE takes only scalar arguments (enable flag plus
+    // three zeros); no pointers or out-params. A non-zero return (e.g. pre-6.4
+    // kernel → EINVAL) is non-fatal.
+    let rc = unsafe { libc::prctl(PR_SET_MEMORY_MERGE, ENABLE, UNUSED, UNUSED, UNUSED) };
+    if rc == 0 {
+        tracing::info!("KSM page-merging enabled for guest memory (PR_SET_MEMORY_MERGE)");
+    } else {
+        tracing::warn!(
+            error = %std::io::Error::last_os_error(),
+            "A3S_BOX_KSM set but PR_SET_MEMORY_MERGE failed (needs Linux 6.4+); continuing without KSM"
+        );
+    }
+}
+
+#[cfg(not(target_os = "linux"))]
+fn maybe_enable_ksm_merge() {}
+
 fn run() -> Result<()> {
     let args = Args::parse();
 
@@ -129,6 +173,9 @@ fn run() -> Result<()> {
         "Starting VM"
     );
 
+    // Opt-in KSM: mark guest memory mergeable before libkrun allocates it.
+    maybe_enable_ksm_merge();
+
     // Validate rootfs exists
     if !spec.rootfs_path.exists() {
         return Err(BoxError::BoxBootError {

From 264ecf165680c6ae17ea5afaa1ab6ab349f093f6 Mon Sep 17 00:00:00 2001
From: Roy Lin
Date: Thu, 11 Jun 2026 12:53:03 +0800
Subject: [PATCH 2/3] perf(boot): trim 1000ms stabilize floor + reflink CoW rootfs copy fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tier 1 of the CoW/boot-latency work.

- ready.rs wait_for_vm_running: the fixed 1000 ms "stabilize" sleep ran on
EVERY boot before the readiness probe even started. Replace it with a 250 ms
has_exited poll — still catches an immediate launch failure (bad config makes
libkrun exit in milliseconds) and fails loudly, but shaves ~750 ms off every
boot. Later crashes are still caught by wait_for_exec_ready's has_exited
checks.
- layer_cache.rs copy_dir_recursive: the CopyProvider fallback (used when
overlayfs is unavailable) did a full per-file byte copy.
Add copy_file_cow, which prefers a FICLONE reflink so a new box's rootfs
shares blocks CoW with the cached image on btrfs/XFS(reflink)/bcachefs
(instant, no extra disk), falling back to a byte copy on ext4/cross-device.
Permissions preserved like fs::copy.
---
 src/runtime/src/cache/layer_cache.rs | 42 +++++++++++++++++++++++++++-
 src/runtime/src/vm/ready.rs          | 36 ++++++++++++++++--------
 2 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/src/runtime/src/cache/layer_cache.rs b/src/runtime/src/cache/layer_cache.rs
index 23a2d84..b2a3679 100644
--- a/src/runtime/src/cache/layer_cache.rs
+++ b/src/runtime/src/cache/layer_cache.rs
@@ -323,7 +323,7 @@ pub(crate) fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> {
         } else if meta.is_dir() {
             copy_dir_recursive(&src_path, &dst_path)?;
         } else {
-            std::fs::copy(&src_path, &dst_path).map_err(|e| {
+            copy_file_cow(&src_path, &dst_path).map_err(|e| {
                 BoxError::CacheError(format!(
                     "Failed to copy {} to {}: {}",
                     src_path.display(),
@@ -338,6 +338,46 @@ pub(crate) fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> {
     Ok(())
 }
 
+/// Copy a regular file, preferring a copy-on-write reflink (`FICLONE`) so a new
+/// box's rootfs shares blocks with the cached image — instant, no extra disk — on
+/// reflink-capable filesystems (btrfs, XFS `reflink=1`, bcachefs). Falls back to a
+/// plain byte copy when reflink is unsupported (e.g. ext4) or the source and
+/// destination are on different filesystems. Overlay is preferred on Linux, so
+/// this only runs on the `CopyProvider` fallback path.
+fn copy_file_cow(src: &Path, dst: &Path) -> std::io::Result<()> {
+    #[cfg(target_os = "linux")]
+    {
+        use std::os::unix::io::AsRawFd;
+        // FICLONE = _IOW(0x94, 9, int)
+        const FICLONE: libc::c_ulong = 0x4004_9409;
+        let reflinked = (|| -> std::io::Result<bool> {
+            let s = std::fs::File::open(src)?;
+            let d = std::fs::OpenOptions::new()
+                .write(true)
+                .create(true)
+                .truncate(true)
+                .open(dst)?;
+            // SAFETY: FICLONE's argument is the source fd; both fds are valid for
+            // the call. A non-zero return (unsupported FS / cross-device) just
+            // means "fall back to a byte copy".
+            let rc = unsafe { libc::ioctl(d.as_raw_fd(), FICLONE, s.as_raw_fd()) };
+            if rc != 0 {
+                return Ok(false);
+            }
+            // FICLONE clones data only, so mirror the mode bits like fs::copy. A
+            // failure is propagated: the byte-copy fallback below then retries and
+            // returns the error, instead of silently leaving `dst` with default perms.
+            d.set_permissions(s.metadata()?.permissions())?;
+            Ok(true)
+        })()
+        .unwrap_or(false);
+        if reflinked {
+            return Ok(());
+        }
+    }
+    std::fs::copy(src, dst).map(|_| ())
+}
+
 /// Calculate the total size of a directory recursively.
 pub(crate) fn dir_size(path: &Path) -> std::io::Result<u64> {
     let mut total = 0;
diff --git a/src/runtime/src/vm/ready.rs b/src/runtime/src/vm/ready.rs
index 8bac187..e4cc3c1 100644
--- a/src/runtime/src/vm/ready.rs
+++ b/src/runtime/src/vm/ready.rs
@@ -8,22 +8,34 @@ use crate::grpc::ExecClient;
 use super::VmManager;
 
 impl VmManager {
-    /// Wait for the VM process to be running (for generic OCI images without an agent).
+    /// Confirm the VM didn't fail on launch (for generic OCI images without an agent).
     ///
-    /// Gives the VM a brief moment to start, then verifies the process hasn't exited.
+    /// A bad config makes libkrun exit within milliseconds, so we only need a short
+    /// window to catch an *immediate* crash and fail loudly: poll `has_exited` for
+    /// 250 ms rather than sleeping a fixed 1 s before a single check. Crashes that
+    /// happen later are caught by `wait_for_exec_ready`'s `has_exited` checks, which
+    /// gate the rest of boot anyway.
     pub(crate) async fn wait_for_vm_running(&self) -> Result<()> {
-        const STABILIZE_MS: u64 = 1000;
+        const MAX_WAIT_MS: u64 = 250;
+        const POLL_MS: u64 = 25;
 
-        tracing::debug!("Waiting for VM process to stabilize");
-        tokio::time::sleep(tokio::time::Duration::from_millis(STABILIZE_MS)).await;
-
-        if let Some(ref handler) = *self.handler.read().await {
-            if !handler.is_running() {
-                return Err(BoxError::BoxBootError {
-                    message: "VM process exited immediately after start".to_string(),
-                    hint: Some("Check console output for errors".to_string()),
-                });
+        tracing::debug!("Confirming VM process started");
+        let start = tokio::time::Instant::now();
+        loop {
+            if let Some(ref handler) = *self.handler.read().await {
+                // has_exited is zombie-aware (a halted VM's shim becomes a zombie);
+                // is_running's kill(pid,0) would still report it alive.
+                if handler.has_exited() {
+                    return Err(BoxError::BoxBootError {
+                        message: "VM process exited immediately after start".to_string(),
+                        hint: Some("Check console output for errors".to_string()),
+                    });
+                }
+            }
+            if start.elapsed() >= tokio::time::Duration::from_millis(MAX_WAIT_MS) {
+                break;
             }
+            tokio::time::sleep(tokio::time::Duration::from_millis(POLL_MS)).await;
         }
 
         tracing::debug!("VM process is running");

From 8cb8423e002f526fbdfc73450cc2a77d1423526f Mon Sep 17 00:00:00 2001
From: Roy Lin
Date: Thu, 11 Jun 2026 14:18:38 +0800
Subject: [PATCH 3/3] test(cache): cover copy_file_cow (content + mode preserved, overwrite)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tests the reflink-preferring copy helper on the fallback path (any FS):
content and permission bits survive whether FICLONE reflinks or fs::copy is
used, and the destination is truncated/overwritten.
---
 src/runtime/src/cache/layer_cache.rs | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/src/runtime/src/cache/layer_cache.rs b/src/runtime/src/cache/layer_cache.rs
index b2a3679..7702aee 100644
--- a/src/runtime/src/cache/layer_cache.rs
+++ b/src/runtime/src/cache/layer_cache.rs
@@ -940,4 +940,46 @@ mod tests {
         cache.invalidate(digest).unwrap();
         assert!(cache.get(digest).unwrap().is_none());
     }
+
+    #[test]
+    fn test_copy_file_cow_preserves_content_and_mode() {
+        // Holds on either path — FICLONE reflink or the byte-copy fallback: the
+        // destination must carry the source's bytes and its permission bits.
+        const PAYLOAD: &[u8] = b"hello copy-on-write";
+        let tmp = TempDir::new().unwrap();
+        let (src, dst) = (tmp.path().join("src.bin"), tmp.path().join("dst.bin"));
+        std::fs::write(&src, PAYLOAD).unwrap();
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            let exec_mode = std::fs::Permissions::from_mode(0o755);
+            std::fs::set_permissions(&src, exec_mode).unwrap();
+        }
+
+        copy_file_cow(&src, &dst).unwrap();
+
+        let copied = std::fs::read(&dst).unwrap();
+        assert_eq!(copied, PAYLOAD);
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            let dst_meta = std::fs::metadata(&dst).unwrap();
+            let mode = dst_meta.permissions().mode() & 0o777;
+            assert_eq!(mode, 0o755, "executable bit must survive the copy");
+        }
+    }
+
+    #[test]
+    fn test_copy_file_cow_overwrites_existing_dst() {
+        // Both FICLONE and the fs::copy fallback truncate the destination first,
+        // so no tail of the longer old content may remain.
+        let tmp = TempDir::new().unwrap();
+        let (src, dst) = (tmp.path().join("src"), tmp.path().join("dst"));
+        std::fs::write(&dst, b"old-and-longer").unwrap();
+        std::fs::write(&src, b"new").unwrap();
+
+        copy_file_cow(&src, &dst).unwrap();
+
+        assert_eq!(std::fs::read(&dst).unwrap(), b"new");
+    }
 }