Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion src/runtime/src/cache/layer_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ pub(crate) fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> {
} else if meta.is_dir() {
copy_dir_recursive(&src_path, &dst_path)?;
} else {
std::fs::copy(&src_path, &dst_path).map_err(|e| {
copy_file_cow(&src_path, &dst_path).map_err(|e| {
BoxError::CacheError(format!(
"Failed to copy {} to {}: {}",
src_path.display(),
Expand All @@ -338,6 +338,46 @@ pub(crate) fn copy_dir_recursive(src: &Path, dst: &Path) -> Result<()> {
Ok(())
}

/// Copy a regular file, preferring a copy-on-write reflink (`FICLONE`) so a new
/// box's rootfs shares blocks with the cached image — instant, no extra disk — on
/// reflink-capable filesystems (btrfs, XFS `reflink=1`, bcachefs). Falls back to a
/// plain byte copy when reflink is unsupported (e.g. ext4) or the source and
/// destination are on different filesystems. Overlay is preferred on Linux, so
/// this only runs on the `CopyProvider` fallback path.
///
/// The destination is created if missing and truncated if it already exists.
///
/// # Errors
///
/// Any failure on the reflink path (opening either file, the ioctl itself, or
/// copying the permission bits) is not reported directly; it only triggers the
/// `std::fs::copy` fallback, and that call's error is what the caller sees.
fn copy_file_cow(src: &Path, dst: &Path) -> std::io::Result<()> {
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        // FICLONE = _IOW(0x94, 9, int)
        const FICLONE: libc::c_ulong = 0x4004_9409;
        let reflinked = (|| -> std::io::Result<bool> {
            let s = std::fs::File::open(src)?;
            let d = std::fs::OpenOptions::new()
                .write(true)
                .create(true)
                .truncate(true)
                .open(dst)?;
            // The libc crate declares ioctl's request parameter as `c_ulong` on
            // glibc but `c_int` on musl, so let the cast target be inferred
            // instead of hard-coding one of them. The value fits in both.
            // SAFETY: FICLONE's argument is the source fd; both fds are valid for
            // the call. A non-zero return (unsupported FS / cross-device) just
            // means "fall back to a byte copy".
            #[allow(clippy::unnecessary_cast)] // only a no-op cast on glibc targets
            let rc = unsafe { libc::ioctl(d.as_raw_fd(), FICLONE as _, s.as_raw_fd()) };
            if rc != 0 {
                return Ok(false);
            }
            // FICLONE clones data only — copy the permission bits like fs::copy.
            // A failure here must not be dropped (the file could silently lose
            // its executable bit); `?` routes it to the fs::copy fallback below,
            // which either applies the mode itself or surfaces the error.
            d.set_permissions(s.metadata()?.permissions())?;
            Ok(true)
        })()
        .unwrap_or(false);
        if reflinked {
            return Ok(());
        }
    }
    std::fs::copy(src, dst).map(|_| ())
}

/// Calculate the total size of a directory recursively.
pub(crate) fn dir_size(path: &Path) -> std::io::Result<u64> {
let mut total = 0;
Expand Down Expand Up @@ -900,4 +940,41 @@ mod tests {
cache.invalidate(digest).unwrap();
assert!(cache.get(digest).unwrap().is_none());
}

#[test]
fn test_copy_file_cow_preserves_content_and_mode() {
    // The copy may be a reflink (FICLONE) or a plain byte copy depending on the
    // filesystem under the temp dir; either way the bytes and the permission
    // bits of the source must show up on the destination.
    const PAYLOAD: &[u8] = b"hello copy-on-write";

    let scratch = TempDir::new().unwrap();
    let source = scratch.path().join("src.bin");
    let target = scratch.path().join("dst.bin");
    std::fs::write(&source, PAYLOAD).unwrap();
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        // Mark the source executable so a dropped mode is detectable.
        let exec_mode = std::fs::Permissions::from_mode(0o755);
        std::fs::set_permissions(&source, exec_mode).unwrap();
    }

    copy_file_cow(&source, &target).unwrap();

    let copied = std::fs::read(&target).unwrap();
    assert_eq!(copied, PAYLOAD);
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let target_meta = std::fs::metadata(&target).unwrap();
        let mode = target_meta.permissions().mode() & 0o777;
        assert_eq!(mode, 0o755, "executable bit must survive the copy");
    }
}

#[test]
fn test_copy_file_cow_overwrites_existing_dst() {
    // A pre-existing, longer destination must end up holding exactly the
    // source bytes: both the FICLONE path and the fs::copy fallback truncate.
    let scratch = TempDir::new().unwrap();
    let source = scratch.path().join("src");
    let target = scratch.path().join("dst");
    std::fs::write(&source, b"new").unwrap();
    std::fs::write(&target, b"old-and-longer").unwrap();

    copy_file_cow(&source, &target).unwrap();

    let contents = std::fs::read(&target).unwrap();
    assert_eq!(contents, b"new");
}
}
36 changes: 24 additions & 12 deletions src/runtime/src/vm/ready.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,34 @@ use crate::grpc::ExecClient;
use super::VmManager;

impl VmManager {
/// Wait for the VM process to be running (for generic OCI images without an agent).
/// Confirm the VM didn't fail on launch (for generic OCI images without an agent).
///
/// Gives the VM a brief moment to start, then verifies the process hasn't exited.
/// A bad config makes libkrun exit within milliseconds, so we only need a short
/// window to catch an *immediate* crash and fail loudly. Poll for that instead
/// of a fixed 1 s sleep — it shaved ~750 ms off every boot. Crashes that happen
/// later are caught by `wait_for_exec_ready`'s `has_exited` checks, which gate
/// the rest of boot anyway.
pub(crate) async fn wait_for_vm_running(&self) -> Result<()> {
const STABILIZE_MS: u64 = 1000;
const MAX_WAIT_MS: u64 = 250;
const POLL_MS: u64 = 25;

tracing::debug!("Waiting for VM process to stabilize");
tokio::time::sleep(tokio::time::Duration::from_millis(STABILIZE_MS)).await;

if let Some(ref handler) = *self.handler.read().await {
if !handler.is_running() {
return Err(BoxError::BoxBootError {
message: "VM process exited immediately after start".to_string(),
hint: Some("Check console output for errors".to_string()),
});
tracing::debug!("Confirming VM process started");
let start = std::time::Instant::now();
loop {
if let Some(ref handler) = *self.handler.read().await {
// has_exited is zombie-aware (a halted VM's shim becomes a zombie);
// is_running's kill(pid,0) would still report it alive.
if handler.has_exited() {
return Err(BoxError::BoxBootError {
message: "VM process exited immediately after start".to_string(),
hint: Some("Check console output for errors".to_string()),
});
}
}
if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 {
break;
}
tokio::time::sleep(tokio::time::Duration::from_millis(POLL_MS)).await;
}

tracing::debug!("VM process is running");
Expand Down
39 changes: 39 additions & 0 deletions src/shim/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,42 @@ fn main() {
}
}

/// Opt-in (env `A3S_BOX_KSM=1`): mark this shim's anonymous memory — including
/// libkrun's guest RAM, which `start_enter` allocates as anonymous `mmap` after
/// this runs — as KSM-mergeable via `prctl(PR_SET_MEMORY_MERGE)` (Linux 6.4+).
/// With KSM enabled on the host (`/sys/kernel/mm/ksm/run = 1`), identical pages
/// across same-image microVMs (kernel text, common runtime/libs) are deduplicated
/// by ksmd, so N warm VMs of one image cost far less host RAM than N× their size.
/// Best-effort: a no-op when the env is unset or on pre-6.4 kernels (EINVAL).
///
/// The variable is treated as enabled only for the exact values `1`, `true`,
/// `yes` or `on`; anything else (including an unset or non-UTF-8 value) leaves
/// KSM untouched. A failing `prctl` is logged as a warning and never aborts
/// startup.
#[cfg(target_os = "linux")]
fn maybe_enable_ksm_merge() {
    // PR_SET_MEMORY_MERGE (since Linux 6.4) — not in all libc versions, so use
    // the numeric value directly.
    const PR_SET_MEMORY_MERGE: libc::c_int = 67;
    // prctl(2) is variadic and the kernel reads every argument as an
    // `unsigned long`. An untyped literal would be passed as a 32-bit int, so
    // give the arguments their real width explicitly; the unused trailing
    // arguments must be zero for this option.
    const ENABLE: libc::c_ulong = 1;
    const UNUSED: libc::c_ulong = 0;

    let enabled = std::env::var("A3S_BOX_KSM")
        .map(|v| matches!(v.as_str(), "1" | "true" | "yes" | "on"))
        .unwrap_or(false);
    if !enabled {
        return;
    }

    // SAFETY: PR_SET_MEMORY_MERGE takes a single scalar (enable=1); no pointers
    // or out-params. A non-zero return (e.g. pre-6.4 kernel → EINVAL) is non-fatal.
    let rc = unsafe { libc::prctl(PR_SET_MEMORY_MERGE, ENABLE, UNUSED, UNUSED, UNUSED) };
    if rc == 0 {
        tracing::info!("KSM page-merging enabled for guest memory (PR_SET_MEMORY_MERGE)");
    } else {
        tracing::warn!(
            error = %std::io::Error::last_os_error(),
            "A3S_BOX_KSM set but PR_SET_MEMORY_MERGE failed (needs Linux 6.4+); continuing without KSM"
        );
    }
}

/// Non-Linux stand-in with an empty body: compiled only when the target OS is
/// not Linux, so the call site needs no `cfg` guard of its own.
#[cfg(not(target_os = "linux"))]
fn maybe_enable_ksm_merge() {}

fn run() -> Result<()> {
let args = Args::parse();

Expand Down Expand Up @@ -129,6 +165,9 @@ fn run() -> Result<()> {
"Starting VM"
);

// Opt-in KSM: mark guest memory mergeable before libkrun allocates it.
maybe_enable_ksm_merge();

// Validate rootfs exists
if !spec.rootfs_path.exists() {
return Err(BoxError::BoxBootError {
Expand Down
Loading