diff --git a/config/config.go b/config/config.go index 99de7dd..c9feb50 100644 --- a/config/config.go +++ b/config/config.go @@ -63,11 +63,11 @@ type SystemUserConfig struct { // AutoLifecycleConfig controls automatic sandbox lifecycle transitions type AutoLifecycleConfig struct { - Enabled bool - PauseAfterIdleSec int // auto-pause after N seconds of inactivity (default: 60) - StopAfterPausedSec int // auto-stop after N seconds of being paused (default: 900) - DeleteAfterStoppedSec int // auto-delete after N seconds of being stopped (default: 604800) - CheckIntervalSec int // how often the manager scans (default: 30) + Enabled bool + SnapshotAfterIdleSec int // auto-snapshot after N seconds of inactivity (default: 60) + DeleteAfterSnapshottedSec int // auto-delete after N seconds of being snapshotted (default: 604800) + CheckIntervalSec int // how often the manager scans (default: 30) + Concurrency int // max concurrent snapshot/delete operations (default: 10) } // Config holds all application configuration @@ -214,11 +214,11 @@ const ( DefaultRedisPassword = "" DefaultRedisDB = 0 // Auto-lifecycle defaults - DefaultAutoLifecycleEnabled = true - DefaultAutoLifecyclePauseAfterIdleSec = 60 // 1 minute - DefaultAutoLifecycleStopAfterPausedSec = 300 // 5 minutes - DefaultAutoLifecycleDeleteAfterStoppedSec = 604800 // 1 week - DefaultAutoLifecycleCheckIntervalSec = 30 // 30 seconds + DefaultAutoLifecycleEnabled = true + DefaultAutoLifecycleSnapshotAfterIdleSec = 60 // 1 minute + DefaultAutoLifecycleDeleteAfterSnapshottedSec = 604800 // 1 week + DefaultAutoLifecycleCheckIntervalSec = 30 // 30 seconds + DefaultAutoLifecycleConcurrency = 10 // Monitor defaults DefaultMonitorEnabled = true // Pagination defaults @@ -317,11 +317,11 @@ func New() *Config { MaxAgeSec: getEnvInt("CORS_MAX_AGE_SEC", DefaultCORSMaxAgeSec), }, AutoLifecycle: AutoLifecycleConfig{ - Enabled: getEnvBool("AUTO_LIFECYCLE_ENABLED", DefaultAutoLifecycleEnabled), - PauseAfterIdleSec: getEnvInt("AUTO_LIFECYCLE_PAUSE_AFTER_IDLE_SEC", DefaultAutoLifecyclePauseAfterIdleSec), - StopAfterPausedSec: getEnvInt("AUTO_LIFECYCLE_STOP_AFTER_PAUSED_SEC", DefaultAutoLifecycleStopAfterPausedSec), - DeleteAfterStoppedSec: getEnvInt("AUTO_LIFECYCLE_DELETE_AFTER_STOPPED_SEC", DefaultAutoLifecycleDeleteAfterStoppedSec), - CheckIntervalSec: getEnvInt("AUTO_LIFECYCLE_CHECK_INTERVAL_SEC", DefaultAutoLifecycleCheckIntervalSec), + Enabled: getEnvBool("AUTO_LIFECYCLE_ENABLED", DefaultAutoLifecycleEnabled), + SnapshotAfterIdleSec: getEnvInt("AUTO_LIFECYCLE_SNAPSHOT_AFTER_IDLE_SEC", DefaultAutoLifecycleSnapshotAfterIdleSec), + DeleteAfterSnapshottedSec: getEnvInt("AUTO_LIFECYCLE_DELETE_AFTER_SNAPSHOTTED_SEC", DefaultAutoLifecycleDeleteAfterSnapshottedSec), + CheckIntervalSec: getEnvInt("AUTO_LIFECYCLE_CHECK_INTERVAL_SEC", DefaultAutoLifecycleCheckIntervalSec), + Concurrency: getEnvInt("AUTO_LIFECYCLE_CONCURRENCY", DefaultAutoLifecycleConcurrency), }, Monitor: MonitorConfig{ Enabled: getEnvBool("MONITOR_ENABLED", DefaultMonitorEnabled), diff --git a/docs/snapshot-restore-scale-security-review.md b/docs/snapshot-restore-scale-security-review.md new file mode 100644 index 0000000..e3f886d --- /dev/null +++ b/docs/snapshot-restore-scale-security-review.md @@ -0,0 +1,216 @@ +# Snapshot/Restore Scale and Security Review + +Date: 2026-06-19 +Branch reviewed: `feat/ch-snap-restore` +Scope: local working-tree changes in `voidrun` + +## Executive Summary + +The snapshot/restore redesign is moving in a useful direction for startup latency and fleet efficiency, but it is not yet ready to be called optimized for scale and security. + +The strongest positives are: + +- `singleflight` deduplication for concurrent auto-restore calls +- persisted network metadata (`macAddress`, `netnsName`, `tapName`) to make restore deterministic +- bounded lifecycle concurrency for snapshot/delete sweeps + +The main blockers are: + +1. Restored VMs lose part of the host-side confinement that fresh boots still have. +2. The new DNS firewall rules weaken network isolation because they are inserted before the private-range drops. +3. Auto-restore work is tied to the first caller's request context, which can cause shared restores to fail under load. +4. The public API contract was not updated to match the lifecycle rewrite. +5. The new memory settings may reduce VM density, and there is no evidence in this branch that the trade-off was measured. +6. The repo is not currently green under `go test ./...`. + +Verdict: good prototype progress, but not yet production-ready from a scale/security standpoint. + +## What Changed + +This branch replaces the old `start/stop/pause/resume` flow with a `snapshot/restore` model and updates the service layer to auto-restore snapshotted sandboxes on demand. + +Major themes in the diff: + +- lifecycle state model changes from `running/paused/stopped` to `running/snapshotted/killed/deleted` +- runtime snapshot creation and restore support added in `runtime/lifecycle.go` +- sandbox service updated to auto-restore via `singleflight` +- lifecycle manager updated to auto-snapshot idle sandboxes and auto-delete old snapshotted sandboxes +- network namespace setup updated to allow DNS only to configured nameservers +- router changed from `/start`, `/stop`, `/pause`, `/resume` to `/snapshot`, `/restore` + +## Findings + +### 1. High: restore path drops Landlock confinement + +Fresh boots still enable both seccomp and Landlock, but the production restore path only re-enables seccomp. That means a restored Cloud Hypervisor process can end up with broader filesystem access than a newly created VM. + +Why it matters: + +- security posture becomes inconsistent by lifecycle state +- a sandbox that was safe at create-time becomes less isolated after restore +- this is the kind of regression that can be missed in functional testing but matters in a multi-tenant environment + +Evidence: + +- `runtime/lifecycle.go` fresh create path appends `--seccomp` and `--landlock` +- `runtime/lifecycle.go` restore path appends only `--seccomp` + +Recommended fix: + +- make restore use the same Landlock policy builder as create +- avoid maintaining two separate security configurations for the same VMM role +- add an automated test that asserts restore and create both include the same confinement flags + +### 2. High: DNS allow rules are ordered before the private-range drops + +The new rules allow DNS to configured nameservers before the branch drops traffic to metadata and RFC1918 ranges. If a configured nameserver lives in link-local or private space, that allow rule wins. + +Why it matters: + +- it weakens the current "deny internal networks from the guest" model +- metadata or internal resolver access could be reintroduced through configuration +- the new test already shows the rule order is opposite of the intended policy + +Evidence: + +- `runtime/network.go` inserts DNS `ACCEPT` rules before the `169.254.169.254`, `10/8`, `172.16/12`, and `192.168/16` drops +- `runtime/network_test.go` fails with `DNS rules should be AFTER the drops` + +Recommended fix: + +- move DNS allow rules after the metadata/private-network drops, or +- explicitly reject private/link-local nameserver addresses at config validation time +- keep the regression test and require it to pass before merge + +### 3. Medium: shared auto-restore is coupled to a caller request context + +The `singleflight` dedupe is a good idea, but the shared restore still runs inside the first caller's request context. If that caller disconnects or times out, the restore can be canceled and rolled back for every concurrent waiter. + +Why it matters: + +- burst traffic to the same sandbox can fail together +- tail latency becomes sensitive to client disconnects and gateway timeouts +- this turns a scale optimization into a reliability hazard under load + +Evidence: + +- `service/sandbox.go` calls `s.restoreGroup.Do(id, func() { return s.Restore(ctx, orgID, id) })` +- `service/sandbox.go` then uses that same `ctx` in `waitForAgent()` + +Recommended fix: + +- decouple the restore worker from the first request by using a fresh bounded internal context +- let callers wait on the shared work result, but do not let one caller cancel the whole restore +- consider a per-sandbox in-flight state machine if restore behavior keeps growing + +### 4. Medium: API docs and route contract drifted apart + +The router now exposes `/snapshot` and `/restore`, but the OpenAPI spec still documents `/start`, `/stop`, `/pause`, and `/resume`. The schema enum also still advertises old states. + +Why it matters: + +- generated SDKs and external clients will be wrong +- support and product teams can share outdated lifecycle behavior +- integration breakage is likely even if the server code works + +Evidence: + +- `server/server.go` registers `/snapshot` and `/restore` +- `openapi.yml` still documents `/sandboxes/{id}/start`, `/stop`, `/pause`, `/resume` +- `openapi.yml` still lists lifecycle states including `stopped` and `paused`, not `snapshotted` + +Recommended fix: + +- update `openapi.yml` in the same change set as route changes +- regenerate any downstream clients after the spec is corrected +- add a lightweight check that route names and OpenAPI paths stay in sync + +### 5. Medium: memory settings may reduce density, with no proof of the trade-off + +The branch changes memory configuration from shared memory mode to private memory mode on both the API and CLI paths. + +Why it matters: + +- memory sharing is often important for VM density when many guests share the same base image +- disabling it may be the right compatibility decision for snapshots, but it can reduce host efficiency +- the branch does not include benchmark evidence showing the fleet-level impact is acceptable + +Evidence: + +- `runtime/lifecycle.go` changes `Shared: true` to `Shared: false` +- `runtime/lifecycle.go` changes CLI memory flags from `size=%dM,shared=on,mergeable=off` to `size=%dM` + +Recommended fix: + +- document why shared memory had to be disabled +- run before/after density and memory-pressure measurements +- if the change is required for restore correctness, call that out explicitly in docs and rollout notes + +### 6. Medium: current branch is not test-clean + +The branch currently fails `go test ./...`. + +Why it matters: + +- merge confidence is lower when a lifecycle rewrite is not validated end to end +- one failure is directly tied to the new network policy behavior +- another failure comes from a helper program that no longer matches current interfaces + +Observed failures: + +- `runtime/network_test.go` fails because DNS rules are ordered before the deny rules +- `cmd/test-sandbox/main.go` does not compile against the current repository APIs + +Recommended fix: + +- make the full Go test suite green before merge +- either update `cmd/test-sandbox/main.go` to current interfaces or exclude it from normal package builds if it is only a local experiment + +## Scale Assessment + +### Improvements + +- `singleflight` is the right direction for preventing restore stampedes +- lifecycle manager concurrency caps are a good guardrail for bulk snapshot/delete work +- storing MAC and NetNS metadata should reduce restore-time recomputation and edge cases + +### Remaining scale concerns + +- restore cancellation is still fragile because it depends on request-scoped context +- restore readiness still relies on tight polling loops and serial post-restore steps +- memory density impact is unknown after disabling shared guest memory +- API contract drift increases rollout cost across SDKs and automation + +Overall scale verdict: improved architecture, but not yet proven or hardened for high-concurrency production use. + +## Security Assessment + +### Improvements + +- DNS is now restricted to configured nameservers instead of broad outbound UDP/TCP allowances +- sandbox network metadata is persisted, reducing restore-time guessing +- Cloud Hypervisor lifecycle handling appears more explicit than the earlier warm-start model + +### Remaining security concerns + +- restore path loses Landlock parity with fresh create +- DNS rule order weakens isolation if nameservers are internal or link-local +- configuration should validate nameservers against forbidden ranges instead of relying only on iptables ordering +- route/spec drift makes it easier for external callers to rely on outdated lifecycle assumptions + +Overall security verdict: not ready to claim secure-by-default until restore confinement and firewall ordering are fixed. + +## Recommended Next Steps + +1. Fix restore-path security parity by reusing the same Landlock policy generation as create. +2. Reorder DNS firewall rules or reject unsafe nameserver addresses during config validation. +3. Decouple `singleflight` restore execution from request-scoped cancellation. +4. Update `openapi.yml` and any generated clients to the new lifecycle model. +5. Benchmark memory density and restore latency before and after the shared-memory change. +6. Get `go test ./...` green and keep the new network regression test in CI. + +## Merge Recommendation + +Do not merge as-is if the goal is a production-ready scale/security improvement. + +This branch is close enough to keep iterating on, but it should clear the restore confinement issue, the firewall ordering issue, and the current test failures before being treated as ready to share as a completed solution rather than an in-progress design. diff --git a/go.mod b/go.mod index 71e882b..ee24cff 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/vishvananda/netlink v1.3.1 go.mongodb.org/mongo-driver v1.16.1 golang.org/x/crypto v0.46.0 + golang.org/x/sync v0.19.0 ) require ( @@ -60,7 +61,6 @@ require ( go.uber.org/mock v0.6.0 // indirect golang.org/x/arch v0.23.0 // indirect golang.org/x/net v0.48.0 // indirect - golang.org/x/sync v0.19.0 // indirect golang.org/x/text v0.33.0 // indirect google.golang.org/protobuf v1.36.11 // indirect ) diff --git a/handler/sandbox.go b/handler/sandbox.go index 77799f3..f5128fe 100644 --- a/handler/sandbox.go +++ b/handler/sandbox.go @@ -126,7 +126,7 @@ func (h *SandboxHandler) Delete(c *gin.Context) error { return nil } -func (h *SandboxHandler) Start(c *gin.Context) error { +func (h *SandboxHandler) Snapshot(c *gin.Context) error { id := c.Param("id") orgID, err := util.GetOrgIDFromContext(c) @@ -134,14 +134,14 @@ func (h *SandboxHandler) Start(c *gin.Context) error { return err } - if err := h.sandboxService.Start(c.Request.Context(), orgID, id); err != nil { - return util.ErrInternal("Start failed", err) + if err := h.sandboxService.Snapshot(c.Request.Context(), orgID, id); err != nil { + return util.ErrInternal("Snapshot failed", err) } - c.JSON(http.StatusOK, model.NewSuccessResponse("Sandbox started", nil)) + c.JSON(http.StatusOK, model.NewSuccessResponse("Sandbox snapshotted", nil)) return nil } -func (h *SandboxHandler) Stop(c *gin.Context) error { +func (h *SandboxHandler) Restore(c *gin.Context) error { id := c.Param("id") orgID, err := util.GetOrgIDFromContext(c) @@ -149,39 +149,9 @@ func (h *SandboxHandler) Stop(c *gin.Context) error { return err } - if err := h.sandboxService.Stop(c.Request.Context(), orgID, id); err != nil { - return util.ErrInternal("Stop failed", err) + if err := h.sandboxService.Restore(c.Request.Context(), orgID, id); err != nil { + return util.ErrInternal("Restore failed", err) } - c.JSON(http.StatusOK, model.NewSuccessResponse("Sandbox stopped", nil)) - return nil -} - -func (h *SandboxHandler) Pause(c *gin.Context) error { - id := c.Param("id") - - orgID, err := util.GetOrgIDFromContext(c) - if err != nil { - return err - } - - if err := h.sandboxService.Pause(c.Request.Context(), orgID, id); err != nil { - return util.ErrInternal("Pause failed", err) - } - c.JSON(http.StatusOK, model.NewSuccessResponse("Sandbox paused", nil)) - return nil -} - -func (h *SandboxHandler) Resume(c *gin.Context) error { - id := c.Param("id") - - orgID, err := util.GetOrgIDFromContext(c) - if err != nil { - return err - } - - if err := h.sandboxService.Resume(c.Request.Context(), orgID, id); err != nil { - return util.ErrInternal("Resume failed", err) - } - c.JSON(http.StatusOK, model.NewSuccessResponse("Sandbox resumed", nil)) + c.JSON(http.StatusOK, model.NewSuccessResponse("Sandbox restored", nil)) return nil } diff --git a/mcp/tools.go b/mcp/tools.go index 1154705..9ddb6ed 100644 --- a/mcp/tools.go +++ b/mcp/tools.go @@ -37,7 +37,7 @@ func toolCreateSandbox() mcp.Tool { mcp.Description("Unique name for the sandbox (DNS-1123 subdomain format: lowercase alphanumeric and hyphens)"), ), mcp.WithString("image", - mcp.Description("Image name in name or name:ver form (e.g. code, max, docker). Defaults to code if omitted."), + mcp.Description("Image name in name or name:ver form (e.g. code, docker-lite, max, docker). Defaults to code if omitted."), ), mcp.WithNumber("cpu", mcp.Description("Number of vCPUs (1-8). Defaults to 1."), @@ -58,7 +58,7 @@ func toolCreateSandbox() mcp.Tool { mcp.Description("Environment variables for the sandbox (string map)."), ), mcp.WithBoolean("autoSleep", - mcp.Description("If true, auto-pause the VM after idle time."), + mcp.Description("If true, auto-snapshot the VM after idle time."), ), mcp.WithString("region", mcp.Description("Target region when supported by your account."), @@ -109,7 +109,7 @@ func toolDeleteSandbox() mcp.Tool { func toolExecuteCommand() mcp.Tool { return mcp.NewTool( "execute_command", - mcp.WithDescription("Execute a shell command in a sandbox and return the output. The sandbox must be running (it will be auto-resumed if paused)."), + mcp.WithDescription("Execute a shell command in a sandbox and return the output. The sandbox must be running (it will be auto-restored if snapshotted)."), mcp.WithString("id", mcp.Required(), mcp.Description("The sandbox ID"), diff --git a/model/sandbox.go b/model/sandbox.go index 83a13d1..e91125b 100644 --- a/model/sandbox.go +++ b/model/sandbox.go @@ -18,8 +18,7 @@ type Sandbox struct { Status string `bson:"status" json:"status"` AutoSleep bool `bson:"autoSleep" json:"autoSleep"` LastActivityAt *time.Time `bson:"lastActivityAt,omitempty" json:"-"` - PausedAt *time.Time `bson:"pausedAt,omitempty" json:"-"` - StoppedAt *time.Time `bson:"stoppedAt,omitempty" json:"-"` + SnapshottedAt *time.Time `bson:"snapshottedAt,omitempty" json:"-"` CreatedAt time.Time `bson:"createdAt" json:"createdAt"` CreatedBy primitive.ObjectID `bson:"createdBy" json:"createdBy"` OrgID primitive.ObjectID `bson:"orgId" json:"orgId"` @@ -28,6 +27,7 @@ type Sandbox struct { RefID string `bson:"refId,omitempty" json:"refId,omitempty"` TapName string `bson:"tapName,omitempty" json:"-"` NetNSName string `bson:"netnsName,omitempty" json:"-"` + MacAddress string `bson:"macAddress,omitempty" json:"-"` TapDeleted bool `bson:"tapDeleted,omitempty" json:"-"` BillingCompleted bool `bson:"billingCompleted,omitempty" json:"-"` } diff --git a/openapi.yml b/openapi.yml index 556f796..94f5da6 100644 --- a/openapi.yml +++ b/openapi.yml @@ -1222,13 +1222,13 @@ paths: schema: $ref: "#/components/schemas/ErrorResponse" - /sandboxes/{id}/start: + /sandboxes/{id}/snapshot: post: tags: - Sandboxes - summary: Start sandbox - description: Start a stopped sandbox - operationId: startSandbox + summary: Snapshot sandbox + description: Snapshot a running sandbox and stop the VM process + operationId: snapshotSandbox security: - ApiKeyAuth: [] parameters: @@ -1240,85 +1240,7 @@ paths: example: 65ae1234567890abcdef1234 responses: "200": - description: Sandbox started - content: - application/json: - schema: - $ref: "#/components/schemas/SuccessResponse" - "400": - description: Invalid request (sandbox not stopped) - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Sandbox not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - - /sandboxes/{id}/stop: - post: - tags: - - Sandboxes - summary: Stop sandbox - description: Stop a running sandbox - operationId: stopSandbox - security: - - ApiKeyAuth: [] - parameters: - - name: id - in: path - required: true - schema: - type: string - example: 65ae1234567890abcdef1234 - responses: - "200": - description: Sandbox stopped - content: - application/json: - schema: - $ref: "#/components/schemas/SuccessResponse" - "401": - description: Unauthorized - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Sandbox not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - - /sandboxes/{id}/pause: - post: - tags: - - Sandboxes - summary: Pause sandbox - description: Pause a running sandbox - operationId: pauseSandbox - security: - - ApiKeyAuth: [] - parameters: - - name: id - in: path - required: true - schema: - type: string - example: 65ae1234567890abcdef1234 - responses: - "200": - description: Sandbox paused + description: Sandbox snapshotted content: application/json: schema: @@ -1336,13 +1258,13 @@ paths: schema: $ref: "#/components/schemas/ErrorResponse" - /sandboxes/{id}/resume: + /sandboxes/{id}/restore: post: tags: - Sandboxes - summary: Resume sandbox - description: Resume a paused sandbox - operationId: resumeSandbox + summary: Restore sandbox + description: Restore a snapshotted sandbox from its latest snapshot + operationId: restoreSandbox security: - ApiKeyAuth: [] parameters: @@ -1354,7 +1276,7 @@ paths: example: 65ae1234567890abcdef1234 responses: "200": - description: Sandbox resumed + description: Sandbox restored content: application/json: schema: diff --git a/repository/sandbox.go b/repository/sandbox.go index 2fd40a8..363460e 100644 --- a/repository/sandbox.go +++ b/repository/sandbox.go @@ -34,11 +34,9 @@ type ISandboxRepository interface { NextAvailableIP() (string, error) // Lifecycle management methods TouchActivity(ctx context.Context, id primitive.ObjectID) error - SetPausedAt(ctx context.Context, id primitive.ObjectID) error - SetStoppedAt(ctx context.Context, id primitive.ObjectID) error + SetSnapshottedAt(ctx context.Context, id primitive.ObjectID) error FindIdleRunning(ctx context.Context, threshold time.Time) ([]*model.Sandbox, error) - FindStalePaused(ctx context.Context, threshold time.Time) ([]*model.Sandbox, error) - FindStaleStopped(ctx context.Context, threshold time.Time) ([]*model.Sandbox, error) + FindStaleSnapshotted(ctx context.Context, threshold time.Time) ([]*model.Sandbox, error) FindByID(ctx context.Context, id primitive.ObjectID, opts options.FindOneOptions) (*model.Sandbox, error) FreeIP(ctx context.Context, ip string) } @@ -300,24 +298,13 @@ func (r *SandboxRepository) TouchActivity(ctx context.Context, id primitive.Obje return err } -// SetPausedAt sets the pausedAt timestamp and status to paused -func (r *SandboxRepository) SetPausedAt(ctx context.Context, id primitive.ObjectID) error { +// SetSnapshottedAt sets the snapshottedAt timestamp and status to snapshotted +func (r *SandboxRepository) SetSnapshottedAt(ctx context.Context, id primitive.ObjectID) error { now := time.Now() _, err := r.collection.UpdateOne(ctx, bson.M{"_id": id}, bson.M{"$set": bson.M{ - "status": "paused", - "pausedAt": now, - "updatedAt": now, - }}) - return err -} - -// SetStoppedAt sets the stoppedAt timestamp and status to stopped -func (r *SandboxRepository) SetStoppedAt(ctx context.Context, id primitive.ObjectID) error { - now := time.Now() - _, err := r.collection.UpdateOne(ctx, bson.M{"_id": id}, bson.M{"$set": bson.M{ - "status": "stopped", - "stoppedAt": now, - "updatedAt": now, + "status": "snapshotted", + "snapshottedAt": now, + "updatedAt": now, }}) return err } @@ -346,34 +333,14 @@ func (r *SandboxRepository) FindIdleRunning(ctx context.Context, threshold time. return sandboxes, nil } -// FindStalePaused finds paused sandboxes that have been paused since before the threshold -func (r *SandboxRepository) FindStalePaused(ctx context.Context, threshold time.Time) ([]*model.Sandbox, error) { - filter := bson.M{ - "status": "paused", - "pausedAt": bson.M{"$lt": threshold}, - } - cursor, err := r.collection.Find(ctx, filter, &options.FindOptions{ - Projection: bson.M{"_id": 1, "orgId": 1, "name": 1}, - }) - if err != nil { - return nil, err - } - defer cursor.Close(ctx) - var sandboxes []*model.Sandbox - if err = cursor.All(ctx, &sandboxes); err != nil { - return nil, err - } - return sandboxes, nil -} - -// FindStaleStopped finds stopped sandboxes that have been stopped since before the threshold -func (r *SandboxRepository) FindStaleStopped(ctx context.Context, threshold time.Time) ([]*model.Sandbox, error) { +// FindStaleSnapshotted finds snapshotted sandboxes that have been snapshotted since before the threshold +func (r *SandboxRepository) FindStaleSnapshotted(ctx context.Context, threshold time.Time) ([]*model.Sandbox, error) { filter := bson.M{ - "status": "stopped", - "stoppedAt": bson.M{"$lt": threshold}, + "status": "snapshotted", + "snapshottedAt": bson.M{"$lt": threshold}, } cursor, err := r.collection.Find(ctx, filter, &options.FindOptions{ - Projection: bson.M{"_id": 1, "orgId": 1, "name": 1, "createdBy": 1, "tapName": 1}, + Projection: bson.M{"_id": 1, "orgId": 1, "name": 1, "createdBy": 1, "tapName": 1, "netnsName": 1}, }) if err != nil { return nil, err diff --git a/runtime/clh_types.go b/runtime/clh_types.go index 43cfe4f..76cf50c 100644 --- a/runtime/clh_types.go +++ b/runtime/clh_types.go @@ -163,6 +163,7 @@ type RestoreConfig struct { SourceURL string `json:"source_url"` Prefault bool `json:"prefault,omitempty"` Net []NetConfig `json:"net_fds,omitempty"` + Resume bool `json:"resume,omitempty"` } // ReceiveMigrationData is used for receiving migrations diff --git a/runtime/client.go b/runtime/client.go index 94eae16..20aba85 100644 --- a/runtime/client.go +++ b/runtime/client.go @@ -9,6 +9,7 @@ import ( "net" "net/http" "os" + "path/filepath" "strings" "time" ) @@ -251,14 +252,29 @@ func GetEventOffsetPath(sbxID string) string { return fmt.Sprintf("%s/%s/vm.evt_offset", InstancesRoot, sbxID) } -func GetSnapshotsRoot() string { - return fmt.Sprintf("%s/snapshots", InstancesRoot) +// GetSnapshotBaseDir returns the root directory for all snapshots for a sandbox. +func GetSnapshotBaseDir(sbxID string) string { + return fmt.Sprintf("%s/%s/snapshots", InstancesRoot, sbxID) } -func GetSnapshotsDir(sbxID string) string { - return fmt.Sprintf("%s/%s", GetSnapshotsRoot(), sbxID) +// GetLatestSnapshotDir finds the newest timestamped snapshot directory for a sandbox. +func GetLatestSnapshotDir(sbxID string) string { + baseDir := GetSnapshotBaseDir(sbxID) + entries, err := os.ReadDir(baseDir) + if err != nil { + return "" + } + var latest string + for _, entry := range entries { + if entry.IsDir() && strings.HasPrefix(entry.Name(), "snap-") { + if entry.Name() > latest { + latest = entry.Name() + } + } + } + if latest != "" { + return filepath.Join(baseDir, latest) + } + return "" } -func GetSnapshotTempDir(sbxID string) string { - return fmt.Sprintf("%s/%s/.tmp", GetSnapshotsRoot(), sbxID) -} diff --git a/runtime/lifecycle.go b/runtime/lifecycle.go index 2478a12..ac32740 100644 --- a/runtime/lifecycle.go +++ b/runtime/lifecycle.go @@ -29,7 +29,7 @@ func ConfigureNetwork(cfg config.Config, spec *model.SandboxSpec) error { // Create an isolated network namespace with a tap device inside it. // This protects the host from VM-based network attacks and is immune // to host-level `iptables -F` flushes. - nsName, tapName, err := CreateSandboxNetNS(cfg.Network.BridgeName, macAddr, cfg.Network.Prefix) + nsName, tapName, err := CreateSandboxNetNS(cfg.Network.BridgeName, macAddr, cfg.Network.Prefix, cfg.Network.Nameservers) if err != nil { return fmt.Errorf("create netns: %w", err) } @@ -96,6 +96,11 @@ func Create(cfg config.Config, spec model.SandboxSpec, overlayPath string) error return fmt.Errorf("VM crashed on start. Logs:\n%s", string(logs)) } + // Ensure tap0 is attached to br0 in netns after VMM starts + if err := EnsureTapBridge(spec.NetNSName, spec.TapName); err != nil { + log.Printf("[WARN] EnsureTapBridge failed in Create: %v\n", err) + } + tapName := spec.TapName macAddr := spec.MacAddress log.Printf(" [Create] spec.TapName=%q, spec.MacAddress=%q\n", tapName, macAddr) @@ -131,7 +136,7 @@ func Create(cfg config.Config, spec model.SandboxSpec, overlayPath string) error }, Memory: &MemoryConfig{ Size: int64(spec.MemoryMB) * 1024 * 1024, - Shared: true, + Shared: false, Mergeable: false, Prefault: false, }, @@ -178,21 +183,16 @@ func Create(cfg config.Config, spec model.SandboxSpec, overlayPath string) error return nil } -func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) error { - defer util.Track("Sandbox Start (Total CLI)")() - - overlayPath, _ = filepath.Abs(overlayPath) - +// BuildCLIArgs constructs the Cloud Hypervisor CLI arguments from the sandbox configuration +func BuildCLIArgs(cfg config.Config, spec model.SandboxSpec, overlayPath string) []string { // Use centralized path helpers socketPath := GetSocketPath(spec.ID) logPath := GetLogPath(spec.ID) - pidPath := GetPIDPath(spec.ID) vsockPath := GetVsockPath(spec.ID) eventPath := GetEventPath(spec.ID) tapName := spec.TapName macAddr := spec.MacAddress - log.Printf(" [CreateCLI] spec.TapName=%q, spec.MacAddress=%q\n", tapName, macAddr) // 1. Map Configurations to CLI Strings cmdLine := strings.TrimSpace(cfg.Sandbox.KernelCmdline) @@ -214,13 +214,13 @@ func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) er // 2. Build the Base CLI Arguments args := []string{ - "--api-socket", socketPath, // Still useful for monitoring/poweroff + "--api-socket", socketPath, "--log-file", logPath, "--event-monitor", "path=" + eventPath, "--kernel", cfg.Paths.KernelPath, "--cmdline", cmdLine, "--cpus", fmt.Sprintf("boot=%d,max=%d", spec.CPUs, spec.CPUs), - "--memory", fmt.Sprintf("size=%dM,shared=on,mergeable=off", spec.MemoryMB), + "--memory", fmt.Sprintf("size=%dM", spec.MemoryMB), "--disk", fmt.Sprintf("path=%s,backing_files=%s,image_type=%s", overlayPath, backingFiles, imageType), "--net", fmt.Sprintf("tap=%s,mac=%s", tapName, macAddr), "--vsock", fmt.Sprintf("cid=%d,socket=%s", getCidFromIP(spec.IPAddress), vsockPath), @@ -255,18 +255,12 @@ func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) er var llRules []string // Use a map to collect unique rules, then we'll sort them - // Key: path, Value: access string ("r" or "rw") rulesMap := make(map[string]string) - // Kernel image (read file) rulesMap[absKernel] = "r" - // Log file (write) rulesMap[logPath] = "rw" - // Entire instance directory: overlay.qcow2, vm.sock, vsock.sock, vm.evt rulesMap[absInstanceDir] = "rw" - // RNG rulesMap["/dev/urandom"] = "r" - // TUN/TAP and sysfs rulesMap["/dev/net/tun"] = "rw" rulesMap["/sys"] = "r" @@ -276,17 +270,12 @@ func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) er } if backingFiles == "on" { - // Landlock path traversal requires every ancestor directory to have ReadDir. absDataDir, _ := filepath.Abs(filepath.Dir(absBaseDir)) rulesMap[absDataDir] = "r" rulesMap[absBaseDir] = "r" rulesMap[absBackingFile] = "r" } - // Sort rules by path length (shortest first) to ensure broader rules - // are added before narrower ones. This avoids a Landlock bug where - // adding a specific file rule before a broad directory rule causes - // siblings of the specific file to be denied access. var paths []string for p := range rulesMap { paths = append(paths, p) @@ -301,14 +290,27 @@ func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) er args = append(args, "--landlock-rules") args = append(args, llRules...) - } + return args +} + +func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) error { + defer util.Track("Sandbox Start (Total CLI)")() + + overlayPath, _ = filepath.Abs(overlayPath) + + socketPath := GetSocketPath(spec.ID) + logPath := GetLogPath(spec.ID) + pidPath := GetPIDPath(spec.ID) + + args := BuildCLIArgs(cfg, spec, overlayPath) log.Println(args) + netnsArgs := append([]string{"netns", "exec", spec.NetNSName, cfg.CHBinary}, args...) + // 4. Start Cloud Hypervisor Process inside the sandbox NetNS fmt.Printf(">> [Native] Spawning full CLH process inside NetNS %s (CLI Mode)...\n", spec.NetNSName) - netnsArgs := append([]string{"netns", "exec", spec.NetNSName, cfg.CHBinary}, args...) cmd := exec.Command("ip", netnsArgs...) logFile, _ := os.Create(logPath) @@ -328,7 +330,6 @@ func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) er cmd.Process.Release() // 5. Wait for Socket (Acts as a Readiness Probe) - // Because we passed the full config, CH creates the socket and boots immediately. client := NewAPIClient(socketPath) if err := client.WaitForSocket(2 * time.Second); err != nil { logs, _ := os.ReadFile(logPath) @@ -336,60 +337,384 @@ func CreateCLI(cfg config.Config, spec model.SandboxSpec, overlayPath string) er return fmt.Errorf("VM crashed on start. Logs:\n%s", string(logs)) } + // Ensure tap0 is attached to br0 in netns after VMM starts + if err := EnsureTapBridge(spec.NetNSName, spec.TapName); err != nil { + log.Printf("[WARN] EnsureTapBridge failed in CreateCLI: %v\n", err) + } + fmt.Printf(" [+] VM Active! PID: %d, NetNS: %s\n", pid, spec.NetNSName) return nil } -// Stop gracefully shuts down the VM via CLH API (keeps hypervisor and network for restart) -func Stop(id string) error { - defer util.Track("lifecycle: Sandbox Stop")() +// Snapshot creates a snapshot of the VM and terminates the hypervisor. +// It is safe to call concurrently for different sandbox IDs. +func Snapshot(id string) error { + defer util.Track("lifecycle: Sandbox Snapshot")() socketPath := GetSocketPath(id) + baseSnapshotDir := GetSnapshotBaseDir(id) + + // Generate a unique timestamped directory for this snapshot + snapshotDir := filepath.Join(baseSnapshotDir, fmt.Sprintf("snap-%d", time.Now().UnixNano())) + + client := NewCLHClientWithTimeout(socketPath, 30*time.Second) + if !client.IsSocketAvailable() { + return fmt.Errorf("Sandbox not running") + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // Ensure base directory exists + if err := os.MkdirAll(baseSnapshotDir, 0755); err != nil { + return fmt.Errorf("failed to create snapshot base dir: %w", err) + } + if err := os.MkdirAll(snapshotDir, 0755); err != nil { + return fmt.Errorf("failed to create snapshot dir: %w", err) + } + + // 1. Pause VM (tolerate InvalidStateTransition — VM may already be paused) + if err := client.VmPause(ctx); err != nil { + log.Printf("[Snapshot] Warning: VmPause failed for %s (may already be paused): %v", id, err) + } + + // 2. Take Snapshot + snapshotUrl := "file://" + snapshotDir + "/" + if err := client.VmSnapshot(ctx, snapshotUrl); err != nil { + return fmt.Errorf("VmSnapshot failed: %w", err) + } + + // 3. Shutdown VMM (kills the process) + if err := client.VmmShutdown(ctx); err != nil { + log.Printf("[Snapshot] Warning: VmmShutdown failed for %s: %v", id, err) + } + + // 4. Wait for socket to disappear (process dead) — synchronous so the caller + // knows the VMM is truly gone before DB state is written, and so that old- + // snapshot cleanup doesn't race with a concurrent Restore's GetLatestSnapshotDir. + for i := 0; i < 20; i++ { + if !client.IsSocketAvailable() { + break + } + time.Sleep(100 * time.Millisecond) + } - // 1. Gracefully shutdown VM via CLH API (keeps hypervisor process running) - client := NewCLHClient(socketPath) if client.IsSocketAvailable() { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() + log.Printf("[Snapshot] WARNING: VMM %s still alive after 2s, force-killing", id) + if err := forceKillByPIDFile(id); err != nil { + os.Remove(socketPath) + return fmt.Errorf("VMM %s hung and force-kill failed: %w", id, err) + } + } - if err := client.VmShutdown(ctx); err != nil { - fmt.Printf("Warning: VmShutdown failed for %s: %v\n", id, err) + os.Remove(socketPath) + log.Printf("[Snapshot] VM %s snapshotted successfully to %s", id, snapshotDir) + + // 5. Clean up older snapshots synchronously to avoid racing with Restore's + // GetLatestSnapshotDir. Best-effort: log failures but don't fail the snapshot. + if entries, err := os.ReadDir(baseSnapshotDir); err == nil { + for _, entry := range entries { + if entry.IsDir() && strings.HasPrefix(entry.Name(), "snap-") { + fullPath := filepath.Join(baseSnapshotDir, entry.Name()) + if fullPath != snapshotDir { + if rmErr := os.RemoveAll(fullPath); rmErr != nil { + log.Printf("[Snapshot] Warning: failed to remove old snapshot %s: %v", fullPath, rmErr) + } + } + } } + } else { + log.Printf("[Snapshot] Warning: could not read snapshot dir for cleanup %s: %v", baseSnapshotDir, err) } - fmt.Printf(" [+] VM %s Stopped (CLH process and TAP interface preserved).\n", id) + return nil } -// Start boots a VM that is in shutdown state -func Start(id string) error { - defer util.Track("lifecycle: Sandbox Start")() +// Stop gracefully shuts down a VM process via the API and waits for the socket to disappear. +// This is used for cleanup when VM creation/boot fails. +func Stop(id string) error { + defer util.Track("lifecycle: Sandbox Stop")() socketPath := GetSocketPath(id) - client := NewCLHClient(socketPath) + client := NewCLHClientForSandbox(id) if !client.IsSocketAvailable() { - return fmt.Errorf("VM socket not available. Is the hypervisor process running?") + return fmt.Errorf("Sandbox not running") } - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - // Check current state - state, err := client.GetState(ctx) + if err := client.VmmShutdown(ctx); err != nil { + log.Printf("[Stop] Warning: VmmShutdown failed for %s: %v", id, err) + } + + for i := 0; i < 20; i++ { + if !client.IsSocketAvailable() { + break + } + time.Sleep(100 * time.Millisecond) + } + + if client.IsSocketAvailable() { + log.Printf("[Stop] WARNING: VMM %s still alive after 2s, force-killing", id) + if err := forceKillByPIDFile(id); err != nil { + os.Remove(socketPath) + return fmt.Errorf("VMM %s hung and force-kill failed: %w", id, err) + } + } + + os.Remove(socketPath) + + log.Printf("[Stop] VM %s stopped successfully", id) + return nil +} + +// forceKillByPIDFile reads the PID file and forcefully kills the process if it's still alive. +func forceKillByPIDFile(id string) error { + pidPath := GetPIDPath(id) + data, err := os.ReadFile(pidPath) if err != nil { - return fmt.Errorf("failed to get VM state: %w", err) + return fmt.Errorf("failed to read PID file: %w", err) + } + pidStr := strings.TrimSpace(string(data)) + pid, err := strconv.Atoi(pidStr) + if err != nil { + return fmt.Errorf("invalid PID in file: %w", err) } - // Can boot from Created or Shutdown states - if state != VmStateShutdown && state != "Created" { - return fmt.Errorf("VM must be in shutdown or created state to start (current state: %s)", state) + process, err := os.FindProcess(pid) + if err != nil { + return nil // Process already gone } - // Boot the VM - fmt.Printf(" [+] Starting VM %s (state: %s)...\n", id, state) - if err := client.VmBoot(ctx); err != nil { - return fmt.Errorf("vm.boot failed: %w", err) + if err := process.Signal(syscall.SIGKILL); err != nil { + log.Printf("Warning: failed to send SIGKILL to PID %d: %v", pid, err) + } + + time.Sleep(200 * time.Millisecond) + + // Check if it's still alive. A zombie process will respond to Signal(0), + // so we must read its state from /proc to see if it's actually dead. + if err := process.Signal(syscall.Signal(0)); err == nil { + statData, err := os.ReadFile(fmt.Sprintf("/proc/%d/stat", pid)) + if err == nil { + fields := strings.Fields(string(statData)) + if len(fields) >= 3 { + state := fields[2] + if state == "Z" || state == "X" { + // It's a zombie, so it's dead + return nil + } + } + } + return fmt.Errorf("process %d still alive after SIGKILL", pid) } - fmt.Printf(" [+] VM %s Started.\n", id) + return nil +} + +// Restore restores a VM from a snapshot using the REST API (to prevent warm boot) +func Restore(cfg config.Config, spec model.SandboxSpec, overlayPath, snapshotDir string) error { + defer util.Track("lifecycle: Sandbox Restore (API)")() + + if err := EnsureSandboxNetNS(cfg, &spec); err != nil { + return fmt.Errorf("ensure netns: %w", err) + } + + overlayPath, _ = filepath.Abs(overlayPath) + + socketPath := GetSocketPath(spec.ID) + pidPath := GetPIDPath(spec.ID) + logPath := GetLogPath(spec.ID) + + // Clean up old socket, vsock, and event files if they exist so we start fresh + os.Remove(socketPath) + os.Remove(GetEventPath(spec.ID)) + os.Remove(GetEventOffsetPath(spec.ID)) + os.Remove(GetVsockPath(spec.ID)) + + // 1. Build CLI args to start an empty Cloud Hypervisor process + args := []string{ + "--api-socket", socketPath, + "--log-file", logPath, + "--event-monitor", "path=" + GetEventPath(spec.ID), + } + + if cfg.Sandbox.Seccomp { + args = append(args, "--seccomp", "true") + } + + // 2. Prepend NetNS execution + netnsArgs := append([]string{"netns", "exec", spec.NetNSName, cfg.CHBinary}, args...) + + // 3. Start Cloud Hypervisor Process + fmt.Printf(">> [Native] Spawning empty CLH process for restore of %s inside NetNS %s...\n", spec.ID, spec.NetNSName) + cmd := exec.Command("ip", netnsArgs...) + + logFile, _ := os.OpenFile(logPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + cmd.Stdout = logFile + cmd.Stderr = logFile + cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} // Daemonize + + if err := cmd.Start(); err != nil { + return fmt.Errorf("process start failed during restore: %v", err) + } + + pid := cmd.Process.Pid + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(pid)), 0644); err != nil { + cmd.Process.Kill() + return err + } + cmd.Process.Release() + + // 4. Wait for Socket to appear + client := NewAPIClient(socketPath) + if err := client.WaitForSocket(2 * time.Second); err != nil { + logs, _ := os.ReadFile(logPath) + Stop(spec.ID) // Cleanup + return fmt.Errorf("VM crashed on restore startup. Logs:\n%s", string(logs)) + } + + // Ensure tap0 is attached to br0 in netns after VMM starts + if err := EnsureTapBridge(spec.NetNSName, spec.TapName); err != nil { + log.Printf("[WARN] EnsureTapBridge failed in Restore: %v\n", err) + } + + // 5. Send Restore Config via API (use a longer timeout since loading snapshot RAM can take time) + clhClient := NewCLHClientWithTimeout(socketPath, 30*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + sourceURL := "file://" + snapshotDir + if !strings.HasSuffix(sourceURL, "/") { + sourceURL += "/" + } + + restoreCfg := &RestoreConfig{ + SourceURL: sourceURL, + Prefault: false, + Resume: true, + } + + if err := clhClient.VmRestore(ctx, restoreCfg); err != nil { + Stop(spec.ID) + return fmt.Errorf("vm.restore failed: %w", err) + } + + fmt.Printf(" [+] VM %s Restored via API! PID: %d\n", spec.ID, pid) + return nil +} + +// RestoreCLI restores a VM from a snapshot +func RestoreCLI(cfg config.Config, spec model.SandboxSpec, overlayPath, snapshotDir string) error { + defer util.Track("lifecycle: Sandbox Restore")() + + if err := EnsureSandboxNetNS(cfg, &spec); err != nil { + return fmt.Errorf("ensure netns: %w", err) + } + + overlayPath, _ = filepath.Abs(overlayPath) + + socketPath := GetSocketPath(spec.ID) + pidPath := GetPIDPath(spec.ID) + logPath := GetLogPath(spec.ID) + + // Clean up old socket, vsock, and event files if they exist so we start fresh + os.Remove(socketPath) + os.Remove(GetEventPath(spec.ID)) + os.Remove(GetEventOffsetPath(spec.ID)) + os.Remove(GetVsockPath(spec.ID)) + + // 1. Build minimal CLI args for restore + // CLH v52+ requires --kernel (or --firmware) even when restoring from a snapshot. + absKernelPath, _ := filepath.Abs(cfg.Paths.KernelPath) + args := []string{ + "--api-socket", socketPath, + "--log-file", logPath, + "--event-monitor", "path=" + GetEventPath(spec.ID), + "--kernel", absKernelPath, + } + + if cfg.Paths.InitrdPath != "" { + absInitrdPath, _ := filepath.Abs(cfg.Paths.InitrdPath) + args = append(args, "--initramfs", absInitrdPath) + } + + if cfg.Sandbox.Seccomp { + args = append(args, "--seccomp", "true") + args = append(args, "--landlock") + + absBaseDir, _ := filepath.Abs(cfg.Paths.BaseImagesDir) + // Parent of base-images dir (e.g. /root/void-run-prod) — mirrors the + // broad read rule used at fresh-boot so CLH can reach all required files. + absBaseParentDir := filepath.Dir(absBaseDir) + absInstanceDir, _ := filepath.Abs(filepath.Dir(overlayPath)) + absSnapshotDir, _ := filepath.Abs(snapshotDir) + + // Each rule must be a separate element — CLH's clap parser treats + // --landlock-rules as a multi-value flag, not a single space-joined string. + llRules := []string{ + "path=/sys,access=r", + "path=/dev/urandom,access=r", + "path=/dev/net/tun,access=rw", + fmt.Sprintf("path=%s,access=r", absBaseParentDir), + fmt.Sprintf("path=%s,access=r", absBaseDir), + fmt.Sprintf("path=%s,access=r", absKernelPath), + fmt.Sprintf("path=%s,access=rw", absInstanceDir), + fmt.Sprintf("path=%s,access=r", absSnapshotDir), + } + if cfg.Paths.InitrdPath != "" { + absInitrdPath, _ := filepath.Abs(cfg.Paths.InitrdPath) + llRules = append(llRules, fmt.Sprintf("path=%s,access=r", absInitrdPath)) + } + args = append(args, "--landlock-rules") + args = append(args, llRules...) + } + + // 2. Append restore arguments + restoreArg := fmt.Sprintf("source_url=file://%s/,memory_restore_mode=ondemand,prefault=off,resume=true", snapshotDir) + args = append(args, "--restore", restoreArg) + + // 3. Prepend NetNS execution + netnsArgs := append([]string{"netns", "exec", spec.NetNSName, cfg.CHBinary}, args...) + + // 4. Start Cloud Hypervisor Process + fmt.Printf(">> [Native] Spawning restored CLH process for %s inside NetNS %s...\n", spec.ID, spec.NetNSName) + cmd := exec.Command("ip", netnsArgs...) + + logFile, _ := os.OpenFile(logPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + cmd.Stdout = logFile + cmd.Stderr = logFile + cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} // Daemonize + + if err := cmd.Start(); err != nil { + return fmt.Errorf("process start failed during restore: %v", err) + } + + pid := cmd.Process.Pid + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(pid)), 0644); err != nil { + cmd.Process.Kill() + return err + } + cmd.Process.Release() + + // 5. Quick sanity check — just verify the process is alive, don't block + // waiting for the CLH API socket. The caller polls the vsock directly + // via waitForAgent, which is the actual readiness signal. + time.Sleep(5 * time.Millisecond) + if proc, err := os.FindProcess(pid); err == nil { + if err := proc.Signal(syscall.Signal(0)); err != nil { + logs, _ := os.ReadFile(logPath) + return fmt.Errorf("VM crashed on restore. Logs:\n%s", string(logs)) + } + } + + // Ensure tap0 is attached to br0 in netns after VMM starts/restores + if err := EnsureTapBridge(spec.NetNSName, spec.TapName); err != nil { + log.Printf("[WARN] EnsureTapBridge failed in Restore: %v\n", err) + } + + + + fmt.Printf(" [+] VM %s Restored! PID: %d\n", spec.ID, pid) return nil } @@ -447,28 +772,6 @@ func Cleanup(id string) error { return nil } -// Pause pauses a running VM -func Pause(id string) error { - client := NewCLHClientForSandbox(id) - if !client.IsSocketAvailable() { - return fmt.Errorf("Sandbox not running") - } - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - return client.VmPause(ctx) -} - -// Resume resumes a paused VM -func Resume(id string) error { - client := NewCLHClientForSandbox(id) - if !client.IsSocketAvailable() { - return fmt.Errorf("Sandbox not running") - } - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - return client.VmResume(ctx) -} - // Info returns the raw JSON info from Cloud Hypervisor func Info(id string) (string, error) { client := NewCLHClientForSandbox(id) diff --git a/runtime/network.go b/runtime/network.go index 7ad8708..d6f98dc 100644 --- a/runtime/network.go +++ b/runtime/network.go @@ -6,9 +6,14 @@ import ( "fmt" "log" "net" + "os" "os/exec" "strings" + "voidrun/config" + "voidrun/model" + "voidrun/util" + "github.com/vishvananda/netlink" "github.com/vishvananda/netns" ) @@ -18,7 +23,8 @@ const maxIfaceNameLen = 15 // CreateSandboxNetNS creates a fully isolated network namespace for a sandbox. // It wires it to the host bridge via a veth pair and applies strict firewall rules. // Returns (nsName, tapName, error). tapName is always "tap0" inside the netns. -func CreateSandboxNetNS(bridgeName, macAddr, netPrefix string) (nsName, tapName string, err error) { +func CreateSandboxNetNS(bridgeName, macAddr, netPrefix string, nameservers []string) (nsName, tapName string, err error) { + defer util.Track("network:CreateSandboxNetNS")() // Calculate how many random hex bytes we can fit. // Interface name budget: maxIfaceNameLen (15). Separator "-vh-" is 4 chars. // So random hex can use at most maxIfaceNameLen - 4 - len(netPrefix) characters. @@ -39,7 +45,7 @@ func CreateSandboxNetNS(bridgeName, macAddr, netPrefix string) (nsName, tapName hostVeth := netPrefix + "-vh-" + randPart nsVeth := netPrefix + "-vn-" + randPart - if setupErr := setupNetNS(ns, hostVeth, nsVeth, bridgeName, macAddr); setupErr != nil { + if setupErr := setupNetNS(ns, hostVeth, nsVeth, bridgeName, macAddr, nameservers); setupErr != nil { lastErr = setupErr continue } @@ -48,8 +54,44 @@ func CreateSandboxNetNS(bridgeName, macAddr, netPrefix string) (nsName, tapName return "", "", fmt.Errorf("failed to create sandbox netns after 5 attempts, last error: %w", lastErr) } +// EnsureSandboxNetNS checks if the network namespace exists, and if not, recreates it +// with the exact name stored in the spec. +func EnsureSandboxNetNS(cfg config.Config, spec *model.SandboxSpec) error { + defer util.Track("network:EnsureSandboxNetNS")() + if spec.NetNSName == "" { + // If there is no NetNSName, we need to create one. + return ConfigureNetwork(cfg, spec) + } + + _, err := os.Stat("/var/run/netns/" + spec.NetNSName) + if err == nil { + // Namespace already exists, nothing to do + return nil + } + + // Namespace doesn't exist, we must recreate it exactly as it was. + var hostVeth, nsVeth string + nsName := spec.NetNSName + + if strings.Contains(nsName, "-ns-") { + hostVeth = strings.Replace(nsName, "-ns-", "-vh-", 1) + nsVeth = strings.Replace(nsName, "-ns-", "-vn-", 1) + } else if len(nsName) > 3 { + // Legacy format + suffix := nsName[3:] + hostVeth = "veth-h-" + suffix + nsVeth = "veth-n-" + suffix + } else { + return fmt.Errorf("unrecognized netns name format: %s", nsName) + } + + log.Printf(" [Net] Recreating missing NetNS %s (hostVeth: %s, nsVeth: %s)\n", nsName, hostVeth, nsVeth) + return setupNetNS(nsName, hostVeth, nsVeth, cfg.Network.BridgeName, spec.MacAddress, cfg.Network.Nameservers) +} + // setupNetNS performs all the steps to create a fully wired and firewalled netns. -func setupNetNS(nsName, hostVeth, nsVeth, bridgeName, macAddr string) error { +func setupNetNS(nsName, hostVeth, nsVeth, bridgeName, macAddr string, nameservers []string) error { + defer util.Track("network:setupNetNS - " + nsName)() var ok bool // Cleanup guard: on any failure, tear down everything we created so far. defer func() { @@ -107,32 +149,38 @@ func setupNetNS(nsName, hostVeth, nsVeth, bridgeName, macAddr string) error { // WARNING: The iptables-restore heredoc block (<= 3 { + state := fields[2] + if state != "Z" && state != "X" { + t.Errorf("Process should have been killed, but it is alive (state: %s)", state) + } + } + } + } +} diff --git a/server/server.go b/server/server.go index 6bda783..6e89e4c 100644 --- a/server/server.go +++ b/server/server.go @@ -235,10 +235,8 @@ func setupRouter(cfg *config.Config, h *Handlers, s *Services, mw *Middlewares, sandboxByID := sandboxes.Group("/:id") sandboxByID.GET("", handler.Handle(h.Sandbox.Get)) sandboxByID.DELETE("", handler.Handle(h.Sandbox.Delete)) - sandboxByID.POST("/start", handler.Handle(h.Sandbox.Start)) - sandboxByID.POST("/stop", handler.Handle(h.Sandbox.Stop)) - sandboxByID.POST("/pause", handler.Handle(h.Sandbox.Pause)) - sandboxByID.POST("/resume", handler.Handle(h.Sandbox.Resume)) + sandboxByID.POST("/snapshot", handler.Handle(h.Sandbox.Snapshot)) + sandboxByID.POST("/restore", handler.Handle(h.Sandbox.Restore)) sandboxByID.POST("/exec", handler.Handle(h.Exec.Exec)) sandboxByID.POST("/exec-stream", handler.Handle(h.Exec.ExecStream)) sandboxByID.POST("/session-exec", handler.Handle(h.Exec.SessionExec)) diff --git a/server/setup.go b/server/setup.go index 0a8eab6..2b368d7 100644 --- a/server/setup.go +++ b/server/setup.go @@ -15,7 +15,6 @@ import ( "voidrun/util" "github.com/gin-gonic/gin" - "go.mongodb.org/mongo-driver/bson/primitive" "go.mongodb.org/mongo-driver/mongo" ) @@ -183,27 +182,5 @@ func PopulateInitialData(cfg *config.Config, repos *Repositories) error { cfg.SystemUser.OrgID = localOrg.ID } - // Create default system images (using concrete repo) - if imgRepo, ok := repos.Image.(interface{ EnsureSystemImage(model.Image) error }); ok { - if err := imgRepo.EnsureSystemImage(model.Image{ - ID: primitive.NewObjectID(), - Name: "alpine", - Tag: "latest", - Active: true, - CreatedBy: systemUserID, - }); err != nil { - return err - } - if err := imgRepo.EnsureSystemImage(model.Image{ - ID: primitive.NewObjectID(), - Name: "debian", - Tag: "latest", - Active: true, - CreatedBy: systemUserID, - }); err != nil { - return err - } - } - return nil } diff --git a/service/lifecycle_manager.go b/service/lifecycle_manager.go index a7c9d8c..f48b560 100644 --- a/service/lifecycle_manager.go +++ b/service/lifecycle_manager.go @@ -49,8 +49,8 @@ func (m *LifecycleManager) Start(ctx context.Context) { } interval := time.Duration(intervalSec) * time.Second - log.Printf("[lifecycle] started (check every %s, pause-idle=%ds, stop-paused=%ds, delete-stopped=%ds)", - interval, m.cfg.PauseAfterIdleSec, m.cfg.StopAfterPausedSec, m.cfg.DeleteAfterStoppedSec) + log.Printf("[lifecycle] started (check every %s, snapshot-idle=%ds, delete-snapshotted=%ds)", + interval, m.cfg.SnapshotAfterIdleSec, m.cfg.DeleteAfterSnapshottedSec) ticker := time.NewTicker(interval) go func() { @@ -69,16 +69,11 @@ func (m *LifecycleManager) Start(ctx context.Context) { func (m *LifecycleManager) tick(ctx context.Context) { var wg sync.WaitGroup - wg.Add(3) + wg.Add(2) go func() { defer wg.Done() - m.autoPause(ctx) - }() - - go func() { - defer wg.Done() - m.autoStop(ctx) + m.autoSnapshot(ctx) }() go func() { @@ -89,99 +84,111 @@ func (m *LifecycleManager) tick(ctx context.Context) { wg.Wait() } -// autoPause pauses running sandboxes that have been idle too long. -func (m *LifecycleManager) autoPause(ctx context.Context) { - if m.cfg.PauseAfterIdleSec <= 0 { +// autoSnapshot snapshots running sandboxes that have been idle too long. +func (m *LifecycleManager) autoSnapshot(ctx context.Context) { + if m.cfg.SnapshotAfterIdleSec <= 0 { return } - threshold := time.Now().Add(-time.Duration(m.cfg.PauseAfterIdleSec) * time.Second) + threshold := time.Now().Add(-time.Duration(m.cfg.SnapshotAfterIdleSec) * time.Second) sandboxes, err := m.repo.FindIdleRunning(ctx, threshold) if err != nil { - log.Printf("[lifecycle] auto-pause query failed: %v", err) + log.Printf("[lifecycle] auto-snapshot query failed: %v", err) return } - for _, sb := range sandboxes { - id := sb.ID.Hex() - if err := runtime.Pause(id); err != nil { - log.Printf("[lifecycle] auto-pause runtime failed for %s (%s): %v", sb.Name, id, err) - continue - } - if err := m.repo.SetPausedAt(ctx, sb.ID); err != nil { - log.Printf("[lifecycle] auto-pause DB update failed for %s (%s): %v", sb.Name, id, err) - continue - } - log.Printf("[lifecycle] auto-paused sandbox %s (%s) after %ds idle", sb.Name, id, m.cfg.PauseAfterIdleSec) - } -} - -// autoStop stops paused sandboxes that have been paused too long. -func (m *LifecycleManager) autoStop(ctx context.Context) { - if m.cfg.StopAfterPausedSec <= 0 { - return - } - - threshold := time.Now().Add(-time.Duration(m.cfg.StopAfterPausedSec) * time.Second) - sandboxes, err := m.repo.FindStalePaused(ctx, threshold) - if err != nil { - log.Printf("[lifecycle] auto-stop query failed: %v", err) - return + maxConc := m.cfg.Concurrency + if maxConc <= 0 { + maxConc = 10 } + sem := make(chan struct{}, maxConc) + var wg sync.WaitGroup for _, sb := range sandboxes { - id := sb.ID.Hex() - if err := runtime.Stop(id); err != nil { - log.Printf("[lifecycle] auto-stop runtime failed for %s (%s): %v", sb.Name, id, err) - continue - } - if m.metrics != nil { - m.metrics.UnregisterSandbox(id) - } - if err := m.repo.SetStoppedAt(ctx, sb.ID); err != nil { - log.Printf("[lifecycle] auto-stop DB update failed for %s (%s): %v", sb.Name, id, err) - continue - } - log.Printf("[lifecycle] auto-stopped sandbox %s (%s) after %ds paused", sb.Name, id, m.cfg.StopAfterPausedSec) + sb := sb + wg.Add(1) + sem <- struct{}{} + + go func() { + defer func() { <-sem; wg.Done() }() + + id := sb.ID.Hex() + + // Stop event monitor BEFORE snapshotting so it can do a final sync + // while the CLH API socket is still alive. + if m.monitor != nil { + m.monitor.Stop(ctx, id) + } + + if err := runtime.Snapshot(id); err != nil { + log.Printf("[lifecycle] auto-snapshot runtime failed for %s (%s): %v", sb.Name, id, err) + return + } + if m.metrics != nil { + m.metrics.UnregisterSandbox(id) + } + if err := m.repo.SetSnapshottedAt(ctx, sb.ID); err != nil { + log.Printf("[lifecycle] auto-snapshot DB update failed for %s (%s): %v", sb.Name, id, err) + return + } + log.Printf("[lifecycle] auto-snapshotted sandbox %s (%s) after %ds idle", sb.Name, id, m.cfg.SnapshotAfterIdleSec) + }() } + wg.Wait() } -// autoDelete deletes stopped sandboxes that have been stopped too long. +// autoDelete deletes snapshotted sandboxes that have been snapshotted too long. func (m *LifecycleManager) autoDelete(ctx context.Context) { - if m.cfg.DeleteAfterStoppedSec <= 0 { + if m.cfg.DeleteAfterSnapshottedSec <= 0 { return } - threshold := time.Now().Add(-time.Duration(m.cfg.DeleteAfterStoppedSec) * time.Second) - sandboxes, err := m.repo.FindStaleStopped(ctx, threshold) + threshold := time.Now().Add(-time.Duration(m.cfg.DeleteAfterSnapshottedSec) * time.Second) + sandboxes, err := m.repo.FindStaleSnapshotted(ctx, threshold) if err != nil { log.Printf("[lifecycle] auto-delete query failed: %v", err) return } - for _, sb := range sandboxes { - id := sb.ID.Hex() + maxConc := m.cfg.Concurrency + if maxConc <= 0 { + maxConc = 10 + } + sem := make(chan struct{}, maxConc) + var wg sync.WaitGroup - if err := runtime.Delete(id, sb.TapName, sb.NetNSName); err != nil { - log.Printf("[lifecycle] auto-delete runtime failed for %s (%s): %v", sb.Name, id, err) - // Continue with cleanup anyway — the VM may already be gone - } + for _, sb := range sandboxes { + sb := sb + wg.Add(1) + sem <- struct{}{} + + go func() { + defer func() { <-sem; wg.Done() }() + + id := sb.ID.Hex() + + if err := runtime.Delete(id, sb.TapName, sb.NetNSName); err != nil { + log.Printf("[lifecycle] auto-delete runtime failed for %s (%s): %v", sb.Name, id, err) + // Continue with cleanup anyway — the VM may already be gone + } - // Stop event monitor (final sync) - if m.monitor != nil { - m.monitor.Stop(ctx, id) - } + // Stop event monitor (final sync) + if m.monitor != nil { + m.monitor.Stop(ctx, id) + } - // Physical cleanup - if err := runtime.Cleanup(id); err != nil { - fmt.Printf("[lifecycle] auto-delete cleanup failed for %s (%s): %v\n", sb.Name, id, err) - } + // Physical cleanup + if err := runtime.Cleanup(id); err != nil { + fmt.Printf("[lifecycle] auto-delete cleanup failed for %s (%s): %v\n", sb.Name, id, err) + } - // Mark as deleted in DB - if err := m.repo.UpdateStatusForHealth(ctx, sb.ID, "deleted"); err != nil { - log.Printf("[lifecycle] auto-delete DB update failed for %s (%s): %v", sb.Name, id, err) - continue - } - log.Printf("[lifecycle] auto-deleted sandbox %s (%s) after %ds stopped", sb.Name, id, m.cfg.DeleteAfterStoppedSec) + // Mark as deleted in DB + if err := m.repo.UpdateStatusForHealth(ctx, sb.ID, "deleted"); err != nil { + log.Printf("[lifecycle] auto-delete DB update failed for %s (%s): %v", sb.Name, id, err) + return + } + log.Printf("[lifecycle] auto-deleted sandbox %s (%s) after %ds snapshotted", sb.Name, id, m.cfg.DeleteAfterSnapshottedSec) + }() } + wg.Wait() } diff --git a/service/sandbox.go b/service/sandbox.go index 3c84c1c..be40eaa 100644 --- a/service/sandbox.go +++ b/service/sandbox.go @@ -24,18 +24,20 @@ import ( "go.mongodb.org/mongo-driver/bson" "go.mongodb.org/mongo-driver/bson/primitive" "go.mongodb.org/mongo-driver/mongo/options" + "golang.org/x/sync/singleflight" ) var ErrSandboxNotFound = errors.New("sandbox not found") // SandboxService handles sandbox business logic type SandboxService struct { - repo repository.ISandboxRepository - imageRepo repository.IImageRepository - cfg *config.Config - metrics *metrics.Manager - monitor *runtime.EventMonitor - projection primitive.M + repo repository.ISandboxRepository + imageRepo repository.IImageRepository + cfg *config.Config + metrics *metrics.Manager + monitor *runtime.EventMonitor + projection primitive.M + restoreGroup singleflight.Group // deduplicates concurrent auto-restore calls per sandbox } // NewSandboxService creates a new sandbox service @@ -50,14 +52,14 @@ func NewSandboxService(cfg *config.Config, repo repository.ISandboxRepository, i "_id": 1, "name": 1, "image": 1, + "ip": 1, "cpu": 1, "mem": 1, "diskMB": 1, "status": 1, "autoSleep": 1, "lastActivityAt": 1, - "pausedAt": 1, - "stoppedAt": 1, + "snapshottedAt": 1, "createdAt": 1, "orgId": 1, "createdBy": 1, @@ -66,6 +68,7 @@ func NewSandboxService(cfg *config.Config, repo repository.ISandboxRepository, i "tapName": 1, "tapDeleted": 1, "netnsName": 1, + "macAddress": 1, }, } } @@ -221,14 +224,23 @@ func (s *SandboxService) Create(ctx context.Context, req model.CreateSandboxRequ }() } - go func() { - log.Printf(" [Agent] Configuring network on %s (async)...\n", spec.ID) + if syncEnabled { + log.Printf(" [Agent] Configuring network on %s (sync)...\n", spec.ID) if cfgErr := configureAgentNetwork(spec.ID, &netCfg); cfgErr != nil { log.Printf(" [Agent] network config failed on %s: %v\n", spec.ID, cfgErr) } else { log.Printf(" [Agent] network config done on %s\n", spec.ID) } - }() + } else { + go func() { + log.Printf(" [Agent] Configuring network on %s (async)...\n", spec.ID) + if cfgErr := configureAgentNetwork(spec.ID, &netCfg); cfgErr != nil { + log.Printf(" [Agent] network config failed on %s: %v\n", spec.ID, cfgErr) + } else { + log.Printf(" [Agent] network config done on %s\n", spec.ID) + } + }() + } autoSleep := true if req.AutoSleep != nil { @@ -239,7 +251,7 @@ func (s *SandboxService) Create(ctx context.Context, req model.CreateSandboxRequ sandbox := &model.Sandbox{ ID: objID, Name: req.Name, - Image: req.Image, + Image: imageName, IP: ip, CPU: cpu, Mem: mem, @@ -251,6 +263,7 @@ func (s *SandboxService) Create(ctx context.Context, req model.CreateSandboxRequ RefID: req.RefID, TapName: spec.TapName, NetNSName: spec.NetNSName, + MacAddress: spec.MacAddress, // persist so Restore doesn't need to re-derive it LastActivityAt: &now, Status: "running", CreatedAt: now, @@ -314,218 +327,178 @@ func (s *SandboxService) Delete(ctx context.Context, orgID primitive.ObjectID, i return nil } -func (s *SandboxService) Start(ctx context.Context, orgID primitive.ObjectID, id string) error { +func (s *SandboxService) Snapshot(ctx context.Context, orgID primitive.ObjectID, id string) error { sandbox, err := s.getOrgScopedSandbox(ctx, orgID, id) if err != nil { return err } - // Verify it's stopped - if sandbox.Status != "stopped" { - return fmt.Errorf("sandbox is not stopped (current status: %s)", sandbox.Status) + if sandbox.Status != "running" { + return fmt.Errorf("sandbox is not running (current status: %s)", sandbox.Status) } - socketPath := runtime.GetSocketPath(id) - - // Check if hypervisor is running (socket exists) - client := runtime.NewCLHClient(socketPath) - if client.IsSocketAvailable() { - // Warm start - hypervisor running, just boot the VM - log.Printf("[Start] Warm start for sandbox %s\n", id) - if err := runtime.Start(id); err != nil { - return fmt.Errorf("failed to start VM: %w", err) - } - - timeout := 30 * time.Second - if err := waitForAgent(ctx, id, timeout); err != nil { - - return fmt.Errorf("agent not ready: %w", err) - } - } else { - // Cold start - hypervisor not running, need to recreate - log.Printf("[Start] Cold start for sandbox %s - recreating VM\n", id) - - spec := model.SandboxSpec{ - ID: id, - Type: sandbox.Image, - CPUs: sandbox.CPU, - MemoryMB: sandbox.Mem, - DiskMB: sandbox.DiskMB, - IPAddress: sandbox.IP, - } - - tap := strings.TrimSpace(sandbox.TapName) - nsName := strings.TrimSpace(sandbox.NetNSName) - if tap == "" || nsName == "" { - // No existing netns — create a fresh one - if err := runtime.ConfigureNetwork(*s.cfg, &spec); err != nil { - return fmt.Errorf("cold start network setup failed: %w", err) - } - if ok, err := s.repo.UpdateNetNSByIDAndOrg(ctx, sandbox.ID, orgID, spec.TapName, spec.NetNSName); err != nil { - log.Printf("[WARN] failed to persist netns info for %s: %v\n", id, err) - } else if !ok { - log.Printf("[WARN] netns update matched no document for %s\n", id) - } - } else { - spec.TapName = tap - spec.NetNSName = nsName - spec.MacAddress = runtime.GenerateMAC(sandbox.IP) - } - - overlayPath := runtime.GetOverlayPath(id) - if err := runtime.Create(*s.cfg, spec, overlayPath); err != nil { - return fmt.Errorf("failed to recreate VM: %w", err) - } + // Stop event monitor BEFORE snapshot so it can do a final sync while the CLH socket is alive. + if s.monitor != nil { + s.monitor.Stop(ctx, id) + } - // Wait for agent - if err := waitForAgent(ctx, id, 30*time.Second); err != nil { - return fmt.Errorf("agent not ready after restart: %w", err) - } + if err := runtime.Snapshot(id); err != nil { + return err } - // Update status to running and clear stoppedAt - if _, err := s.repo.UpdateStatusByIDAndOrg(ctx, sandbox.ID, orgID, "running"); err != nil { - // VM is running but DB update failed - log but don't fail - fmt.Printf("[WARN] VM started but failed to update DB status: %v\n", err) + // Update database status to snapshotted and set snapshottedAt + if _, err := s.repo.UpdateStatusByIDAndOrg(ctx, sandbox.ID, orgID, "snapshotted"); err != nil { + return fmt.Errorf("failed to update status: %w", err) + } + if err := s.repo.SetSnapshottedAt(ctx, sandbox.ID); err != nil { + log.Printf("[WARN] Failed to set snapshottedAt for %s: %v", id, err) } - // Register with metrics if s.metrics != nil { - spec := model.SandboxSpec{ - ID: id, - CPUs: sandbox.CPU, - MemoryMB: sandbox.Mem, - DiskMB: sandbox.DiskMB, - } - s.metrics.RegisterSandbox(spec.ID, sandbox.Name, runtime.GetSocketPath(spec.ID), spec.CPUs, spec.MemoryMB, spec.DiskMB) + s.metrics.UnregisterSandbox(sandbox.ID.Hex()) } return nil } -func (s *SandboxService) Stop(ctx context.Context, orgID primitive.ObjectID, id string) error { +func (s *SandboxService) Restore(ctx context.Context, orgID primitive.ObjectID, id string) error { sandbox, err := s.getOrgScopedSandbox(ctx, orgID, id) if err != nil { return err } - if sandbox.Status != "running" { - return fmt.Errorf("sandbox is not running (current status: %s)", sandbox.Status) + // Verify it's snapshotted + if sandbox.Status != "snapshotted" { + return fmt.Errorf("sandbox is not snapshotted (current status: %s)", sandbox.Status) } - if err := runtime.Stop(id); err != nil { - return err - } - if s.metrics != nil { - s.metrics.UnregisterSandbox(sandbox.ID.Hex()) + imageName := sandbox.Image + if !strings.Contains(imageName, ":") { + img, err := s.imageRepo.GetLatestByNameForOrg(imageName, orgID) + if err == nil && img != nil && img.Tag != "" { + imageName = fmt.Sprintf("%s:%s", img.Name, img.Tag) + } } - // Update database status to stopped and set stoppedAt - if _, err := s.repo.UpdateStatusByIDAndOrg(ctx, sandbox.ID, orgID, "stopped"); err != nil { - return fmt.Errorf("failed to update status: %w", err) - } - // Also set stoppedAt timestamp for auto-delete tracking - if err := s.repo.SetStoppedAt(ctx, sandbox.ID); err != nil { - log.Printf("[WARN] Failed to set stoppedAt for %s: %v", id, err) + // Resolve MAC: prefer stored value, fall back to deterministic derivation for + // sandboxes created before this field was added. + macAddr := sandbox.MacAddress + if macAddr == "" { + macAddr = runtime.GenerateMAC(sandbox.IP) } - return nil -} - -// EnsureRunning checks if sandbox is running and starts it if stopped (auto-start feature) -func (s *SandboxService) EnsureRunning(ctx context.Context, orgID primitive.ObjectID, id string) error { - // Get sandbox from DB to check status - sandbox, err := s.getOrgScopedSandbox(ctx, orgID, id) - if err != nil { - return err + spec := model.SandboxSpec{ + ID: id, + Type: imageName, + CPUs: sandbox.CPU, + MemoryMB: sandbox.Mem, + IPAddress: sandbox.IP, + TapName: sandbox.TapName, + MacAddress: macAddr, + NetNSName: sandbox.NetNSName, + } + + var overlayPath string + if s.cfg.Sandbox.DiskFormat == "raw" { + overlayPath = runtime.GetRawOverlayPath(id) + } else { + overlayPath = runtime.GetOverlayPath(id) } - - // If already running, return immediately - if sandbox.Status == "running" { - return nil + snapshotDir := runtime.GetLatestSnapshotDir(id) + if snapshotDir == "" { + return fmt.Errorf("no valid snapshot found for sandbox %s", id) } - // If paused, resume it - if sandbox.Status == "paused" { - log.Printf("[Auto-Resume] Sandbox %s is paused, resuming...\n", id) - if err := s.Resume(ctx, orgID, id); err != nil { - return fmt.Errorf("failed to auto-resume sandbox: %w", err) - } - - log.Printf("[Auto-Resume] Sandbox %s resumed and ready\n", id) - return nil + if err := runtime.Restore(*s.cfg, spec, overlayPath, snapshotDir); err != nil { + return fmt.Errorf("failed to restore VM: %w", err) } - // If stopped, start it - if sandbox.Status == "stopped" { - log.Printf("[Auto-Start] Sandbox %s is stopped, starting...\n", id) - if err := s.Start(ctx, orgID, id); err != nil { - return fmt.Errorf("failed to auto-start sandbox: %w", err) + // From this point, the VMM is running. Any failure must clean it up. + cleanup := func() { + log.Printf("[Restore] Rolling back: stopping VM %s", id) + if stopErr := runtime.Stop(id); stopErr != nil { + log.Printf("[Restore] Rollback stop failed for %s: %v", id, stopErr) } + } - log.Printf("[Auto-Start] Sandbox %s started and ready\n", id) - return nil + timeout := 30 * time.Second + if err := waitForAgent(ctx, id, timeout); err != nil { + cleanup() + return fmt.Errorf("agent not ready after restore: %w", err) } - // Other states - return fmt.Errorf("sandbox in unexpected state for auto-start/resume: %s", sandbox.Status) -} + // Sync the guest clock — after a snapshot restore the VM clock is frozen at + // the time the snapshot was taken. Inject the current wall-clock time via the + // agent so that `date`, cron jobs, TLS expiry checks, etc. see the right time. + syncSandboxClock(id) -func (s *SandboxService) Pause(ctx context.Context, orgID primitive.ObjectID, id string) error { - sandbox, err := s.getOrgScopedSandbox(ctx, orgID, id) - if err != nil { - return err + // After a snapshot restore, the virtio-net device inside the guest comes back + // with eth0 DOWN (cloud-hypervisor resets the virtio-net device on restore). + // Re-apply the network config to bring eth0 up and restore IP/routes/DNS. + netCfg := buildAgentNetConfig(s.cfg, sandbox.IP, sandbox.Name) + if cfgErr := configureAgentNetwork(id, &netCfg); cfgErr != nil { + log.Printf(" [Restore] network re-config failed on %s: %v\n", id, cfgErr) + } else { + log.Printf(" [Restore] network re-config done on %s\n", id) } - if sandbox.Status != "running" { - return fmt.Errorf("sandbox is not running (current status: %s)", sandbox.Status) + // Update status to running + if _, err := s.repo.UpdateStatusByIDAndOrg(ctx, sandbox.ID, orgID, "running"); err != nil { + cleanup() + return fmt.Errorf("VM restored but failed to update DB status: %w", err) } - if !sandbox.AutoSleep { - return fmt.Errorf("sandbox has auto-sleep disabled") + // Touch activity on restore so the sandbox doesn't immediately get auto-snapshotted again + if err := s.repo.TouchActivity(ctx, sandbox.ID); err != nil { + log.Printf("[WARN] Failed to touch activity on restore for %s: %v", id, err) } - if err := runtime.Pause(id); err != nil { - return err + // Register with metrics + if s.metrics != nil { + s.metrics.RegisterSandbox(id, sandbox.Name, runtime.GetSocketPath(id), sandbox.CPU, sandbox.Mem, sandbox.DiskMB) } - // Update database status to paused and set pausedAt - if _, err := s.repo.UpdateStatusByIDAndOrg(ctx, sandbox.ID, orgID, "paused"); err != nil { - return fmt.Errorf("failed to update status: %w", err) - } - if err := s.repo.SetPausedAt(ctx, sandbox.ID); err != nil { - log.Printf("[WARN] Failed to set pausedAt for %s: %v", id, err) + // Restart CLH event monitor so restored sandboxes get event tracking + if s.monitor != nil { + s.monitor.Start(ctx, sandbox.ID, sandbox.OrgID, sandbox.CreatedBy) } return nil } -func (s *SandboxService) Resume(ctx context.Context, orgID primitive.ObjectID, id string) error { +// EnsureRunning checks if sandbox is running and restores it if snapshotted (auto-restore feature). +// Uses singleflight to deduplicate concurrent restore calls — if 100 exec requests arrive for the +// same snapshotted sandbox, only 1 will actually call Restore(); the other 99 block and share the result. +func (s *SandboxService) EnsureRunning(ctx context.Context, orgID primitive.ObjectID, id string) error { + // Get sandbox from DB to check status sandbox, err := s.getOrgScopedSandbox(ctx, orgID, id) if err != nil { return err } - if sandbox.Status != "paused" { - return fmt.Errorf("sandbox is not paused (current status: %s)", sandbox.Status) + // If already running, return immediately + if sandbox.Status == "running" { + return nil } - if err := runtime.Resume(id); err != nil { - log.Printf("[ERROR] Failed to resume sandbox %s: %v\n", id, err) + // If snapshotted, restore it via singleflight to prevent thundering herd + if sandbox.Status == "snapshotted" { + _, err, shared := s.restoreGroup.Do(id, func() (interface{}, error) { + log.Printf("[Auto-Restore] Sandbox %s is snapshotted, restoring...\n", id) + if err := s.Restore(ctx, orgID, id); err != nil { + return nil, fmt.Errorf("failed to auto-restore sandbox: %w", err) + } + log.Printf("[Auto-Restore] Sandbox %s restored and ready\n", id) + return nil, nil + }) + if shared { + log.Printf("[Auto-Restore] Sandbox %s restore was shared with concurrent caller\n", id) + } return err } - // Update database status to running - if _, err := s.repo.UpdateStatusByIDAndOrg(ctx, sandbox.ID, orgID, "running"); err != nil { - return fmt.Errorf("failed to update status: %w", err) - } - - // Touch activity on resume so the sandbox doesn't immediately get auto-paused again - if err := s.repo.TouchActivity(ctx, sandbox.ID); err != nil { - log.Printf("[WARN] Failed to touch activity on resume for %s: %v", id, err) - } - - return nil + // Other states + return fmt.Errorf("sandbox in unexpected state for auto-restore: %s", sandbox.Status) } func (s *SandboxService) Info(id string) (string, error) { @@ -533,7 +506,7 @@ func (s *SandboxService) Info(id string) (string, error) { } // RefreshStatuses checks each sandbox health and updates status field in DB. -// Status values: running, paused, stopped. +// Status values: running, snapshotted, killed, deleted. func (s *SandboxService) RefreshStatuses(ctx context.Context) error { // Optimization 1: Fetch only necessary fields projection := bson.M{"_id": 1, "status": 1} @@ -558,9 +531,10 @@ func (s *SandboxService) RefreshStatuses(ctx context.Context) error { client := runtime.NewAPIClientForSandbox(id) socketExists := client.IsSocketAvailable() // Fast os.Stat check - // Case 1: DB says Stopped + Socket is GONE. - // Conclusion: It is definitely stopped/dead. No need to call API. - if sb.Status == "stopped" && !socketExists { + // Case 1: DB says Snapshotted. + // Conclusion: It is either snapshotted (socket gone) or in the process of restoring (socket exists). + // In either case, the health check should not touch its status. + if sb.Status == "snapshotted" { continue } @@ -577,7 +551,7 @@ func (s *SandboxService) RefreshStatuses(ctx context.Context) error { go func() { defer func() { <-sem; wg.Done() }() - newState := "stopped" + newState := "killed" if socketExists { apiCtx, cancel := context.WithTimeout(ctx, 2*time.Second) @@ -589,18 +563,14 @@ func (s *SandboxService) RefreshStatuses(ctx context.Context) error { switch strings.ToLower(sbxState) { case "running", "runningvirtualized": newState = "running" - case "paused": - newState = "paused" - case "loaded": - // 'Loaded' means Process active, but Guest not booted. - // For your app, this is "stopped" (ready to start). - newState = "stopped" default: - newState = "stopped" + // If the socket is somehow still there but state is not running + // it might be a zombie, so map it to killed. + newState = "killed" } } else { // Socket exists, but API refused connection or timed out. - // Process is likely zombie or unresponsive. Treat as stopped. + // Process is likely zombie or unresponsive. Treat as killed. fmt.Printf("[health] Sandbox %s unresponsive (socket exists): %v\n", id, err) newState = "killed" } @@ -633,15 +603,24 @@ func waitForAgent(ctx context.Context, sbxID string, timeout time.Duration) erro ctx, cancel := context.WithTimeout(ctx, timeout) defer cancel() - ticker := time.NewTicker(50 * time.Millisecond) - defer ticker.Stop() - start := time.Now() attempts := 0 var lastErr error + // Tight 10ms polling interval with 15ms probe timeout. + // The vsock needs ~350ms to synchronize after restore regardless of + // how often we poll. Using 10ms interval ensures we catch the exact + // moment it becomes ready (at most 25ms overshoot). + const pollInterval = 10 * time.Millisecond + const probeTimeout = 15 * time.Millisecond // CONNECT+OK takes <5ms once ready + + // Use a Ticker (not time.After) to avoid allocating a new timer object + // every iteration — time.After leaks ~3000 timers over a 30s timeout. + ticker := time.NewTicker(pollInterval) + defer ticker.Stop() + for { - err := runtime.Probe(sbxID, 1024, 50*time.Millisecond) + err := runtime.Probe(sbxID, 1024, probeTimeout) attempts++ if err == nil { log.Printf(" [Agent] Ready on %s after %s (%d attempts)\n", sbxID, time.Since(start), attempts) @@ -682,10 +661,59 @@ func configureAgentNetwork(sbxID string, netCfg *agentNetConfig) error { body, _ := io.ReadAll(resp.Body) return fmt.Errorf("configure network status %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) } + io.Copy(io.Discard, resp.Body) return nil } +// syncSandboxClock injects the current wall-clock time into a restored sandbox +// guest via `date -s @`. After a VM snapshot/restore the guest +// clock is frozen at the snapshot timestamp; this call corrects it so the +// guest sees the real current time immediately after restore. +// +// The agent vsock health-check can pass a split-second before the /exec HTTP +// handler is fully initialised (EOF on handshake), so we retry a few times +// with a short back-off before giving up. +// This is best-effort: a failure is logged but never causes the restore to fail. +func syncSandboxClock(sbxID string) { + now := time.Now().Unix() + cmd := fmt.Sprintf("sudo date -s @%d", now) + + payload := map[string]interface{}{ + "cmd": cmd, + "timeout": 5, + } + body, err := json.Marshal(payload) + if err != nil { + log.Printf("[Restore] syncSandboxClock: marshal error for %s: %v", sbxID, err) + return + } + + const maxAttempts = 5 + for attempt := 1; attempt <= maxAttempts; attempt++ { + ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second) + resp, err := ExecAgentCommand(ctx, nil, sbxID, bytes.NewReader(body)) + cancel() + + if err != nil { + log.Printf("[Restore] syncSandboxClock: attempt %d/%d exec error for %s: %v", attempt, maxAttempts, sbxID, err) + time.Sleep(200 * time.Millisecond) + continue + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + log.Printf("[Restore] syncSandboxClock: attempt %d/%d agent returned %d for %s", attempt, maxAttempts, resp.StatusCode, sbxID) + time.Sleep(200 * time.Millisecond) + continue + } + log.Printf(" [Restore] clock synced to epoch %d on %s (attempt %d)", now, sbxID, attempt) + return + } + log.Printf("[WARN] syncSandboxClock: gave up syncing clock for %s after %d attempts", sbxID, maxAttempts) +} + func buildAgentNetConfig(cfg *config.Config, ip, name string) agentNetConfig { hostname := name if hostname == "" { @@ -864,6 +892,7 @@ func setAgentEnvVars(sbxID string, envVars map[string]string) error { body, _ := io.ReadAll(resp.Body) return fmt.Errorf("agent returned status %d: %s", resp.StatusCode, string(body)) } + io.Copy(io.Discard, resp.Body) fmt.Printf("[INFO] Environment variables set on sandbox %s: %v\n", sbxID, envVars) return nil