diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index c9b34ff8f..935b64d71 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -45,6 +45,7 @@ fn test_sandbox() -> DriverSandbox { gpu: false, gpu_device: String::new(), sandbox_token: String::new(), + network_enforcement: 0, }), status: None, } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 5a43eb980..ddc8b47f0 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -330,6 +330,10 @@ impl KubernetesComputeDriver { enable_user_namespaces: self.config.enable_user_namespaces, workspace_default_storage_size: &self.config.workspace_default_storage_size, sa_token_ttl_secs: self.config.effective_sa_token_ttl_secs(), + is_platform_mode: sandbox + .spec + .as_ref() + .is_some_and(|s| s.network_enforcement == 1), }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); let api = self.api(); @@ -823,6 +827,7 @@ fn apply_supervisor_sideload( supervisor_image: &str, supervisor_image_pull_policy: &str, method: SupervisorSideloadMethod, + is_platform_mode: bool, ) { let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { return; @@ -882,16 +887,16 @@ fn apply_supervisor_sideload( serde_json::json!([format!("{}/openshell-sandbox", SUPERVISOR_MOUNT_PATH)]), ); - // Force the supervisor to run as root (UID 0). Sandbox images may set - // a non-root USER directive (e.g. `USER sandbox`), but the supervisor - // needs root to create network namespaces, set up the proxy, and - // configure Landlock/seccomp. The supervisor itself drops privileges - // for child processes via the policy's `run_as_user`/`run_as_group`. 
- let security_context = container - .entry("securityContext") - .or_insert_with(|| serde_json::json!({})); - if let Some(sc) = security_context.as_object_mut() { - sc.insert("runAsUser".to_string(), serde_json::json!(0)); + // In namespace mode, force root (UID 0) so the supervisor can create + // network namespaces and drop privileges for child processes. + // In platform mode, keep the image's default non-root user. + if !is_platform_mode { + let security_context = container + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = security_context.as_object_mut() { + sc.insert("runAsUser".to_string(), serde_json::json!(0)); + } } // Add volume mount @@ -1044,6 +1049,10 @@ struct SandboxPodParams<'a> { /// Lifetime (seconds) of the projected `ServiceAccount` token used /// for the bootstrap `IssueSandboxToken` exchange. sa_token_ttl_secs: i64, + /// Platform network enforcement mode (Issue #899). When true, sandbox + /// pods are emitted without elevated capabilities, compatible with + /// restricted-v2 SCC and restricted Pod Security Standard. + is_platform_mode: bool, } impl Default for SandboxPodParams<'_> { @@ -1065,6 +1074,7 @@ impl Default for SandboxPodParams<'_> { enable_user_namespaces: false, workspace_default_storage_size: DEFAULT_WORKSPACE_STORAGE_SIZE, sa_token_ttl_secs: 3600, + is_platform_mode: false, } } } @@ -1265,22 +1275,32 @@ fn sandbox_template_to_k8s( container.insert("env".to_string(), serde_json::Value::Array(env)); - let mut capabilities: Vec<&str> = vec!["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]; - if use_user_namespaces { - // In a user namespace the bounding set is reset. SETUID/SETGID are - // needed for the supervisor to drop privileges to the sandbox user. - // DAC_READ_SEARCH is needed for cross-UID /proc//fd/ access - // for process identity resolution in network policy enforcement. 
- capabilities.extend(["SETUID", "SETGID", "DAC_READ_SEARCH"]); + if params.is_platform_mode { + // Platform mode: zero elevated capabilities. Compatible with + // restricted-v2 SCC and restricted Pod Security Standard. + container.insert( + "securityContext".to_string(), + serde_json::json!({ + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + } + }), + ); + } else { + let mut capabilities: Vec<&str> = vec!["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]; + if use_user_namespaces { + capabilities.extend(["SETUID", "SETGID", "DAC_READ_SEARCH"]); + } + container.insert( + "securityContext".to_string(), + serde_json::json!({ + "capabilities": { + "add": capabilities + } + }), + ); } - container.insert( - "securityContext".to_string(), - serde_json::json!({ - "capabilities": { - "add": capabilities - } - }), - ); // Mount client TLS secret for mTLS to the server, plus the projected // ServiceAccount token used to bootstrap the sandbox's gateway JWT @@ -1363,6 +1383,7 @@ fn sandbox_template_to_k8s( params.supervisor_image, params.supervisor_image_pull_policy, params.supervisor_sideload_method, + params.is_platform_mode, ); // Inject workspace persistence (init container + PVC volume mount) so @@ -1750,6 +1771,7 @@ mod tests { "custom-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -1779,6 +1801,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -1804,6 +1827,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); // Volume should be an emptyDir @@ -1878,6 +1902,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::ImageVolume, + false, ); let volumes = pod_template["spec"]["volumes"] @@ -1932,6 +1957,7 @@ mod tests { 
"supervisor-image:latest", "", SupervisorSideloadMethod::ImageVolume, + false, ); let volume = &pod_template["spec"]["volumes"][0]; diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index 26c8fc9d3..d55a2806c 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -378,6 +378,7 @@ fn to_proto(raw: PolicyFile) -> SandboxPolicy { run_as_group: p.run_as_group, }), network_policies, + network_enforcement: 0, } } @@ -649,6 +650,7 @@ pub fn restrictive_default_policy() -> SandboxPolicy { run_as_group: "sandbox".into(), }), network_policies: HashMap::new(), + network_enforcement: 0, // NAMESPACE (default) } } @@ -1262,6 +1264,7 @@ network_policies: filesystem: None, landlock: None, network_policies: HashMap::new(), + network_enforcement: 0, }; assert!(validate_sandbox_policy(&policy).is_ok()); } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 126416546..e8b833d4b 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -23,6 +23,7 @@ pub mod procfs; mod provider_credentials; pub mod proxy; mod sandbox; +pub mod seccomp_notify; mod secrets; mod skills; mod ssh; @@ -487,7 +488,10 @@ pub async fn run_sandbox( // Generate ephemeral CA and TLS state for HTTPS L7 inspection. // The CA cert is written to disk so sandbox processes can trust it. - let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { + let (tls_state, ca_file_paths) = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { match SandboxCa::generate() { Ok(ca) => { let tls_dir = std::path::Path::new("/etc/openshell-tls"); @@ -600,79 +604,91 @@ pub async fn run_sandbox( // the entrypoint process's /proc/net/tcp for identity binding. 
let entrypoint_pid = Arc::new(AtomicU32::new(0)); - let (_proxy, denial_rx, bypass_denial_tx, activity_rx, bypass_activity_tx) = - if matches!(policy.network.mode, NetworkMode::Proxy) { - let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!( - "Network mode is set to proxy but no proxy configuration was provided" - ) - })?; + let (_proxy, denial_rx, bypass_denial_tx, activity_rx, bypass_activity_tx) = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { + let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!("Network mode is set to proxy but no proxy configuration was provided") + })?; - let engine = opa_engine.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") - })?; + let engine = opa_engine.clone().ok_or_else(|| { + miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") + })?; - let cache = identity_cache.clone().ok_or_else(|| { - miette::miette!( - "Proxy mode requires an identity cache (OPA engine must be configured)" - ) - })?; + let cache = identity_cache.clone().ok_or_else(|| { + miette::miette!("Proxy mode requires an identity cache (OPA engine must be configured)") + })?; + + // If we have a network namespace, bind to the veth host IP so sandboxed + // processes can reach the proxy via TCP. + #[cfg(target_os = "linux")] + let bind_addr = netns.as_ref().map(|ns| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ns.host_ip(), port) + }); - // If we have a network namespace, bind to the veth host IP so sandboxed - // processes can reach the proxy via TCP. - #[cfg(target_os = "linux")] - let bind_addr = netns.as_ref().map(|ns| { + // Platform mode: no netns, bind proxy to loopback. 
+ #[cfg(target_os = "linux")] + let bind_addr = bind_addr.or_else(|| { + if matches!(policy.network.mode, NetworkMode::Platform) { let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ns.host_ip(), port) - }); + Some(SocketAddr::new( + std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), + port, + )) + } else { + None + } + }); - #[cfg(not(target_os = "linux"))] - let bind_addr: Option = None; + #[cfg(not(target_os = "linux"))] + let bind_addr: Option = None; - // Build inference context for local routing of intercepted inference calls. - let inference_ctx = build_inference_context( - sandbox_id.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; - - // Create denial aggregator channel if in gRPC mode (sandbox_id present). - // Clone the sender for the bypass monitor before passing to the proxy. - let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) - } else { - (None, None, None) - }; - let (activity_tx, activity_rx, bypass_activity_tx) = - activity_collection_channels(sandbox_id.as_deref()); - - let proxy_handle = ProxyHandle::start_with_bind_addr( - proxy_policy, - bind_addr, - engine, - cache, - entrypoint_pid.clone(), - tls_state, - inference_ctx, - Some(provider_credentials.clone()), - Some(policy_local_ctx.clone()), - denial_tx, - activity_tx, - ) - .await?; - ( - Some(proxy_handle), - denial_rx, - bypass_denial_tx, - activity_rx, - bypass_activity_tx, - ) + // Build inference context for local routing of intercepted inference calls. + let inference_ctx = build_inference_context( + sandbox_id.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?; + + // Create denial aggregator channel if in gRPC mode (sandbox_id present). 
+ // Clone the sender for the bypass monitor before passing to the proxy. + let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) } else { - (None, None, None, None, None) + (None, None, None) }; + let (activity_tx, activity_rx, bypass_activity_tx) = + activity_collection_channels(sandbox_id.as_deref()); + + let proxy_handle = ProxyHandle::start_with_bind_addr( + proxy_policy, + bind_addr, + engine, + cache, + entrypoint_pid.clone(), + tls_state, + inference_ctx, + Some(provider_credentials.clone()), + Some(policy_local_ctx.clone()), + denial_tx, + activity_tx, + ) + .await?; + ( + Some(proxy_handle), + denial_rx, + bypass_denial_tx, + activity_rx, + bypass_activity_tx, + ) + } else { + (None, None, None, None, None) + }; // Spawn bypass detection monitor (Linux only, proxy mode only). // Reads /dev/kmsg for nftables log entries and emits structured @@ -705,18 +721,30 @@ pub async fn run_sandbox( #[cfg(not(target_os = "linux"))] let ssh_netns_fd: Option = None; - let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { + let ssh_proxy_url = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { #[cfg(target_os = "linux")] { - netns.as_ref().map(|ns| { + if let Some(ns) = netns.as_ref() { let port = policy .network .proxy .as_ref() .and_then(|p| p.http_addr) .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) + Some(format!("http://{}:{port}", ns.host_ip())) + } else { + // Platform mode: proxy on loopback + let port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + Some(format!("http://127.0.0.1:{port}")) + } } #[cfg(not(target_os = "linux"))] { @@ -1729,8 +1757,10 @@ where } fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { - let (ro, rw) = - 
active_baseline_enrichment_paths(matches!(policy.network.mode, NetworkMode::Proxy)); + let (ro, rw) = active_baseline_enrichment_paths(matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + )); let modified = enrich_sandbox_baseline_paths_with(policy, &ro, &rw, std::path::Path::exists); if modified { diff --git a/crates/openshell-sandbox/src/opa.rs b/crates/openshell-sandbox/src/opa.rs index f5ff5923b..cbb7b4074 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-sandbox/src/opa.rs @@ -1262,6 +1262,7 @@ mod tests { run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, } } @@ -2518,6 +2519,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2641,6 +2643,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2698,6 +2701,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2755,6 +2759,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -3704,6 +3709,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); let input = NetworkInput { @@ -3758,6 +3764,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); let input = NetworkInput { @@ -3828,6 +3835,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + 
network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("Failed to create engine from proto"); @@ -4058,6 +4066,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).unwrap(); // Port 443 @@ -5017,6 +5026,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; // Build engine with our PID (symlink resolution will work via /proc/self/root/) @@ -5094,6 +5104,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; // Initial load at pid=0 — no symlink expansion diff --git a/crates/openshell-sandbox/src/policy.rs b/crates/openshell-sandbox/src/policy.rs index 0827fa0d0..fde345b83 100644 --- a/crates/openshell-sandbox/src/policy.rs +++ b/crates/openshell-sandbox/src/policy.rs @@ -5,7 +5,8 @@ use openshell_core::proto::{ FilesystemPolicy as ProtoFilesystemPolicy, LandlockPolicy as ProtoLandlockPolicy, - ProcessPolicy as ProtoProcessPolicy, SandboxPolicy as ProtoSandboxPolicy, + NetworkEnforcementMode, ProcessPolicy as ProtoProcessPolicy, + SandboxPolicy as ProtoSandboxPolicy, }; use std::net::SocketAddr; use std::path::PathBuf; @@ -62,6 +63,9 @@ pub enum NetworkMode { Block, Proxy, Allow, + /// Platform mode: Landlock + seccomp + loopback proxy, no network namespace. + /// Compatible with restricted-v2 SCC and restricted Pod Security Standard. + Platform, } #[derive(Debug, Clone)] @@ -99,10 +103,13 @@ impl TryFrom for SandboxPolicy { type Error = miette::Report; fn try_from(proto: ProtoSandboxPolicy) -> Result { - // In cluster mode we always run with proxy networking so all egress - // can be evaluated by OPA and `inference.local` is always addressable. 
+ let mode = match proto.network_enforcement() { + NetworkEnforcementMode::NetworkEnforcementNamespace => NetworkMode::Proxy, + NetworkEnforcementMode::NetworkEnforcementPlatform => NetworkMode::Platform, + }; + let network = NetworkPolicy { - mode: NetworkMode::Proxy, + mode, proxy: Some(ProxyPolicy { http_addr: None }), }; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index d004bb7d4..f059584ce 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -226,27 +226,25 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { + if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { let proxy = policy.network.proxy.as_ref().ok_or_else(|| { miette::miette!( "Network mode is set to proxy but no proxy configuration was provided" ) })?; - // When using network namespace, set proxy URL to the veth host IP - if netns_fd.is_some() { - // The proxy is on 10.200.0.1:3128 (or configured port) - let port = proxy.http_addr.map_or(3128, |addr| addr.port()); - let proxy_url = format!("http://10.200.0.1:{port}"); - // Both uppercase and lowercase variants: curl/wget use uppercase, - // gRPC C-core (libgrpc) checks lowercase http_proxy/https_proxy. 
- for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } - } else if let Some(http_addr) = proxy.http_addr { - let proxy_url = format!("http://{http_addr}"); - for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } + let port = proxy.http_addr.map_or(3128, |addr| addr.port()); + let proxy_url = if netns_fd.is_some() { + // Namespace mode: proxy on veth host IP + format!("http://10.200.0.1:{port}") + } else { + // Platform mode (or non-Linux): proxy on loopback + format!("http://127.0.0.1:{port}") + }; + for (key, value) in child_env::proxy_env_vars(&proxy_url) { + cmd.env(key, value); } } @@ -368,7 +366,10 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { + if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { let proxy = policy.network.proxy.as_ref().ok_or_else(|| { miette::miette!( "Network mode is set to proxy but no proxy configuration was provided" diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index e7f37ce4f..32e888650 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -3,10 +3,10 @@ //! Landlock filesystem sandboxing. 
-use crate::policy::{LandlockCompatibility, SandboxPolicy}; +use crate::policy::{LandlockCompatibility, NetworkMode, SandboxPolicy}; use landlock::{ - ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, - RulesetAttr, RulesetCreatedAttr, + ABI, Access, AccessFs, AccessNet, CompatLevel, Compatible, NetPort, PathBeneath, PathFd, + PathFdError, Ruleset, RulesetAttr, RulesetCreatedAttr, Scope, }; use miette::{IntoDiagnostic, Result}; use std::path::{Path, PathBuf}; @@ -184,6 +184,29 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result>>>>>> 5c72320 (feat(sandbox): Landlock TCP port restriction in Platform mode) + } + let mut ruleset = ruleset.create().into_diagnostic()?; let mut rules_applied: usize = 0; @@ -207,6 +230,28 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result) -> Result Result<()> { } pub fn apply(policy: &SandboxPolicy) -> Result<()> { - let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy | NetworkMode::Allow); + let allow_inet = matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Allow | NetworkMode::Platform + ); let main_filter = build_filter(allow_inet)?; let clone3_filter = build_clone3_filter()?; @@ -202,6 +205,21 @@ fn build_filter_rules(allow_inet: bool) -> Result add_socket_domain_rule(&mut rules, domain)?; } + // Block UDP sockets (SOCK_DGRAM) on AF_INET/AF_INET6. + // + // The agent doesn't need UDP: all traffic goes through the CONNECT proxy + // on 127.0.0.1:3128, which resolves DNS on behalf of the agent. This + // matches Full OpenShell behavior where nftables rejects all UDP in the + // network namespace (nft_ruleset.rs:48-49 has no UDP accept rule). + // + // Without this block, an agent could exfiltrate data via DNS tunneling + // (encoding secrets in DNS subdomain labels) or send UDP packets to + // arbitrary destinations -- Landlock ABI v4 only covers TCP. 
+ if allow_inet { + add_sock_dgram_block(&mut rules, libc::AF_INET)?; + add_sock_dgram_block(&mut rules, libc::AF_INET6)?; + } + // Allow AF_NETLINK only for NETLINK_ROUTE (protocol 0). // // NETLINK_ROUTE is needed by getifaddrs(3) which is called by Node.js, @@ -339,6 +357,29 @@ fn add_netlink_non_route_rule(rules: &mut BTreeMap>) -> Re Ok(()) } +/// Block `socket(domain, SOCK_DGRAM, *)` to prevent UDP socket creation. +/// +/// Uses `MaskedEq` on arg1 with mask `0xF` (SOCK_TYPE_MASK) to match +/// `SOCK_DGRAM` (2) regardless of `SOCK_NONBLOCK` or `SOCK_CLOEXEC` flags. +#[allow(clippy::cast_sign_loss)] +fn add_sock_dgram_block(rules: &mut BTreeMap>, domain: i32) -> Result<()> { + let domain_condition = + SeccompCondition::new(0, SeccompCmpArgLen::Dword, SeccompCmpOp::Eq, domain as u64) + .into_diagnostic()?; + + let type_condition = SeccompCondition::new( + 1, // type argument + SeccompCmpArgLen::Dword, + SeccompCmpOp::MaskedEq(0xF), // SOCK_TYPE_MASK + libc::SOCK_DGRAM as u64, + ) + .into_diagnostic()?; + + let rule = SeccompRule::new(vec![domain_condition, type_condition]).into_diagnostic()?; + rules.entry(libc::SYS_socket).or_default().push(rule); + Ok(()) +} + /// Block a syscall when a specific bit pattern is set in an argument. /// /// Uses `MaskedEq` to check `(arg & flag_bit) == flag_bit`, which triggers diff --git a/crates/openshell-sandbox/src/seccomp_notify.rs b/crates/openshell-sandbox/src/seccomp_notify.rs new file mode 100644 index 000000000..6064aeeec --- /dev/null +++ b/crates/openshell-sandbox/src/seccomp_notify.rs @@ -0,0 +1,602 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! seccomp-notify network enforcement for Platform mode. +//! +//! Provides kernel-level `connect()` interception using `SECCOMP_RET_USER_NOTIF`. +//! The supervisor intercepts network syscalls, reads the destination `sockaddr` +//! 
/// A set of allowed destination IPs, pinned at sandbox creation time.
///
/// Name resolution happens once, inside [`DnsPinnedAllowlist::add_domain()`].
/// The resolved IPs are stored and never re-resolved, so a later DNS answer
/// cannot widen the allowlist (DNS rebinding).
///
/// # Limitation: DNS wildcards
///
/// Wildcard domains (e.g., `*.googleapis.com`) cannot be pinned because a
/// wildcard is a policy pattern, not a resolvable host name. Given this OPA
/// policy:
///
/// ```yaml
/// endpoints:
///   - { host: api.anthropic.com, port: 443 }     # exact -- pinnable
///   - { host: "*.googleapis.com", port: 443 }    # wildcard -- NOT pinnable
/// ```
///
/// `add_domain("api.anthropic.com")` resolves and pins the IPs, while
/// `add_domain("*.googleapis.com")` returns an error and pins nothing, so
/// connections to `us-central1-aiplatform.googleapis.com` would be denied by
/// this allowlist even though the OPA policy allows them.
///
/// Callers must skip wildcard endpoints (those containing `*`) and rely on
/// the proxy's OPA `glob.match()` for wildcard domain enforcement.
#[derive(Debug, Clone)]
pub struct DnsPinnedAllowlist {
    /// Every IP a connection may target: the default entries plus pinned IPs.
    allowed_ips: HashSet<IpAddr>,
    /// Address of the proxy; its IP is always part of `allowed_ips`.
    proxy_addr: SocketAddr,
    /// Number of distinct entries seeded by `new()`. This is 2 when the proxy
    /// itself listens on a loopback address and 3 otherwise.
    default_len: usize,
}

impl DnsPinnedAllowlist {
    /// Create an allowlist that permits only the proxy's IP and the IPv4/IPv6
    /// loopback addresses.
    pub fn new(proxy_addr: SocketAddr) -> Self {
        let allowed_ips: HashSet<IpAddr> = [
            proxy_addr.ip(),
            IpAddr::V4(std::net::Ipv4Addr::LOCALHOST),
            IpAddr::V6(std::net::Ipv6Addr::LOCALHOST),
        ]
        .into_iter()
        .collect();
        // The set de-duplicates, so record how many defaults really exist
        // instead of assuming three.
        let default_len = allowed_ips.len();
        Self {
            allowed_ips,
            proxy_addr,
            default_len,
        }
    }

    /// Resolve a domain name and pin its IPs to the allowlist.
    ///
    /// Returns the number of addresses the resolver produced (not the number
    /// of IPs that were new to the allowlist). IP literals are accepted and
    /// are parsed without a DNS query.
    ///
    /// The lookup is synchronous and may block on the system resolver.
    ///
    /// # Errors
    ///
    /// * `InvalidInput` if `domain` contains `*`; wildcard patterns cannot be
    ///   resolved, and nothing is pinned.
    /// * Any error reported by the resolver; nothing is pinned in that case.
    pub fn add_domain(&mut self, domain: &str) -> std::io::Result<usize> {
        use std::net::ToSocketAddrs;

        // Fail fast instead of sending a pattern to the resolver.
        if domain.contains('*') {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!("wildcard domain `{domain}` cannot be DNS-pinned"),
            ));
        }

        // Port 0 is a placeholder: only the IP part of each result is kept.
        let addrs: Vec<SocketAddr> = (domain, 0).to_socket_addrs()?.collect();
        self.allowed_ips.extend(addrs.iter().map(SocketAddr::ip));
        Ok(addrs.len())
    }

    /// Check whether a destination IP is in the allowlist.
    pub fn is_allowed(&self, ip: &IpAddr) -> bool {
        self.allowed_ips.contains(ip)
    }

    /// The proxy address (always allowed).
    pub fn proxy_addr(&self) -> SocketAddr {
        self.proxy_addr
    }

    /// Total number of allowed IPs, including the default entries.
    pub fn len(&self) -> usize {
        self.allowed_ips.len()
    }

    /// Whether the allowlist contains only the default entries.
    ///
    /// Note that this is *not* `len() == 0`: the default entries are always
    /// present, so `len()` is never zero.
    pub fn is_empty(&self) -> bool {
        self.allowed_ips.len() <= self.default_len
    }
}
+    #[repr(C)]
+    #[derive(Debug, Clone)]
+    pub struct SeccompNotif {
+        /// Notification id; echoed back in `SeccompNotifResp::id`.
+        pub id: u64,
+        /// Pid of the task that triggered the notification.
+        pub pid: u32,
+        pub flags: u32,
+        /// Snapshot of the intercepted syscall.
+        pub data: SeccompData,
+    }
+
+    /// Syscall data from the notification.
+    #[repr(C)]
+    #[derive(Debug, Clone)]
+    pub struct SeccompData {
+        /// Syscall number.
+        pub nr: i32,
+        /// `AUDIT_ARCH_*` value of the calling ABI.
+        pub arch: u32,
+        pub instruction_pointer: u64,
+        /// Raw syscall arguments.
+        pub args: [u64; 6],
+    }
+
+    /// Response to send back to the kernel.
+    #[repr(C)]
+    #[derive(Debug, Clone)]
+    pub struct SeccompNotifResp {
+        /// Id of the notification being answered.
+        pub id: u64,
+        /// Value reported as the syscall's return value when `error` is 0.
+        pub val: i64,
+        /// 0 on success, otherwise a negated errno (see `deny`).
+        pub error: i32,
+        pub flags: u32,
+    }
+
+    // --- BPF filter types ---
+
+    /// One classic-BPF instruction (layout of the kernel's `struct sock_filter`).
+    #[repr(C)]
+    struct SockFilter {
+        code: u16,
+        jt: u8,
+        jf: u8,
+        k: u32,
+    }
+
+    /// Classic-BPF program header (layout of the kernel's `struct sock_fprog`).
+    #[repr(C)]
+    struct SockFprog {
+        len: u16,
+        filter: *const SockFilter,
+    }
+
+    // BPF instruction encoding constants
+    const BPF_LD: u16 = 0x00;
+    const BPF_W: u16 = 0x00;
+    const BPF_ABS: u16 = 0x20;
+    const BPF_JMP: u16 = 0x05;
+    const BPF_JEQ: u16 = 0x10;
+    const BPF_JGE: u16 = 0x30;
+    const BPF_K: u16 = 0x00;
+    const BPF_RET: u16 = 0x06;
+
+    // On x86_64 the x32 ABI reports AUDIT_ARCH_X86_64 and instead sets bit 30
+    // of the syscall number, so the arch check alone does not catch it.
+    // No native syscall number on the supported targets reaches this value.
+    const X32_SYSCALL_BIT: u32 = 0x4000_0000;
+
+    // AUDIT_ARCH constants for BPF architecture validation
+    #[cfg(target_arch = "x86_64")]
+    const AUDIT_ARCH_NATIVE: u32 = 0xC000_003E; // AUDIT_ARCH_X86_64
+    #[cfg(target_arch = "aarch64")]
+    const AUDIT_ARCH_NATIVE: u32 = 0xC000_00B7; // AUDIT_ARCH_AARCH64
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    compile_error!(
+        "seccomp-notify BPF filter requires AUDIT_ARCH_NATIVE. \
+         Add the constant for your target_arch to seccomp_notify.rs."
+    );
+
+    /// Install a seccomp BPF filter that returns `SECCOMP_RET_USER_NOTIF`
+    /// for `connect()`, `sendto()`, `sendmsg()`, `recvfrom()`, `recvmsg()`,
+    /// and `bind()` syscalls. Every other native syscall is allowed.
+    ///
+    /// Returns the notification fd on success. The caller is responsible for
+    /// closing the fd (e.g., by wrapping it in `OwnedFd`).
+    ///
+    /// The filter validates `AUDIT_ARCH` and additionally checks the x32
+    /// syscall bit, so that neither a compat ABI nor the x32 ABI can reach
+    /// the ALLOW branch with a differently-numbered network syscall. Both
+    /// cases are routed to the notification path.
+    ///
+    /// NOTE(review): `sendmmsg()`/`recvmmsg()` are not trapped by this filter;
+    /// confirm they are blocked elsewhere or add them here.
+    ///
+    /// Must be called after `prctl(PR_SET_NO_NEW_PRIVS, 1)` has been set.
+    ///
+    /// # Errors
+    ///
+    /// Returns the OS error if the `seccomp(2)` syscall fails.
+    pub fn install_connect_notify_filter() -> io::Result<RawFd> {
+        // Jump offsets are relative to the *next* instruction:
+        // target = index + 1 + offset.
+        let filter = [
+            // [0] Load architecture from seccomp_data.arch (offset 4)
+            SockFilter {
+                code: BPF_LD | BPF_W | BPF_ABS,
+                jt: 0,
+                jf: 0,
+                k: 4, // offsetof(seccomp_data, arch)
+            },
+            // [1] Verify native arch; non-native goes to NOTIFY [11] so the
+            //     supervisor can inspect and deny compat-ABI syscalls.
+            SockFilter {
+                code: BPF_JMP | BPF_JEQ | BPF_K,
+                jt: 0, // continue
+                jf: 9, // non-native → NOTIFY [11]
+                k: AUDIT_ARCH_NATIVE,
+            },
+            // [2] Load syscall number from seccomp_data.nr (offset 0)
+            SockFilter {
+                code: BPF_LD | BPF_W | BPF_ABS,
+                jt: 0,
+                jf: 0,
+                k: 0, // offsetof(seccomp_data, nr)
+            },
+            // [3] x32 ABI (nr >= X32_SYSCALL_BIT) → NOTIFY [11]
+            SockFilter {
+                code: BPF_JMP | BPF_JGE | BPF_K,
+                jt: 7,
+                jf: 0,
+                k: X32_SYSCALL_BIT,
+            },
+            // [4] Check connect → NOTIFY [11]
+            SockFilter {
+                code: BPF_JMP | BPF_JEQ | BPF_K,
+                jt: 6, // jump to NOTIFY
+                jf: 0,
+                k: libc::SYS_connect as u32,
+            },
+            // [5] Check sendto → NOTIFY [11]
+            SockFilter {
+                code: BPF_JMP | BPF_JEQ | BPF_K,
+                jt: 5,
+                jf: 0,
+                k: libc::SYS_sendto as u32,
+            },
+            // [6] Check sendmsg → NOTIFY [11]
+            SockFilter {
+                code: BPF_JMP | BPF_JEQ | BPF_K,
+                jt: 4,
+                jf: 0,
+                k: libc::SYS_sendmsg as u32,
+            },
+            // [7] Check recvfrom → NOTIFY [11]
+            SockFilter {
+                code: BPF_JMP | BPF_JEQ | BPF_K,
+                jt: 3,
+                jf: 0,
+                k: libc::SYS_recvfrom as u32,
+            },
+            // [8] Check recvmsg → NOTIFY [11]
+            SockFilter {
+                code: BPF_JMP | BPF_JEQ | BPF_K,
+                jt: 2,
+                jf: 0,
+                k: libc::SYS_recvmsg as u32,
+            },
+            // [9] Check bind → NOTIFY [11] (prevent binding to arbitrary ports)
+            SockFilter {
+                code: BPF_JMP | BPF_JEQ | BPF_K,
+                jt: 1,
+                jf: 0,
+                k: libc::SYS_bind as u32,
+            },
+            // [10] ALLOW
+            SockFilter {
+                code: BPF_RET | BPF_K,
+                jt: 0,
+                jf: 0,
+                k: SECCOMP_RET_ALLOW,
+            },
+            // [11] NOTIFY
+            SockFilter {
+                code: BPF_RET | BPF_K,
+                jt: 0,
+                jf: 0,
+                k: SECCOMP_RET_USER_NOTIF,
+            },
+        ];
+
+        let prog = SockFprog {
+            len: u16::try_from(filter.len()).expect("BPF filter exceeds u16::MAX instructions"),
+            filter: filter.as_ptr(),
+        };
+
+        // SAFETY: The SockFprog and SockFilter arrays are #[repr(C)] with correct
+        // layout for the kernel ABI. The filter array is stack-allocated and lives
+        // for the duration of the syscall. PR_SET_NO_NEW_PRIVS must be set before
+        // this call. The returned fd is valid until closed.
+        let fd = unsafe {
+            libc::syscall(
+                libc::SYS_seccomp,
+                SECCOMP_SET_MODE_FILTER,
+                SECCOMP_FILTER_FLAG_NEW_LISTENER,
+                std::ptr::from_ref(&prog),
+            )
+        };
+
+        if fd < 0 {
+            return Err(io::Error::last_os_error());
+        }
+
+        Ok(fd as RawFd)
+    }
+
+    /// Receive a seccomp notification from the notification fd.
+    ///
+    /// Blocks until a notification is available.
+    ///
+    /// # Errors
+    ///
+    /// Returns the OS error if the ioctl fails.
+    pub fn recv_notif(notify_fd: RawFd) -> io::Result<SeccompNotif> {
+        // SAFETY: SeccompNotif is #[repr(C)] and matches the kernel's
+        // struct seccomp_notif layout. All-zero bytes are a valid value for
+        // its integer fields, and the kernel writes all fields.
+        let mut notif: SeccompNotif = unsafe { mem::zeroed() };
+        // SAFETY: `notif` is a live, writable, correctly sized buffer for the
+        // duration of the ioctl.
+        let ret = unsafe { libc::ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_RECV, &mut notif) };
+        if ret < 0 {
+            return Err(io::Error::last_os_error());
+        }
+        Ok(notif)
+    }
+
+    /// Send a response to a seccomp notification.
+    ///
+    /// # Errors
+    ///
+    /// Returns the OS error if the ioctl fails.
+    pub fn send_resp(notify_fd: RawFd, resp: &SeccompNotifResp) -> io::Result<()> {
+        // SAFETY: `resp` is a valid #[repr(C)] SeccompNotifResp that outlives
+        // the ioctl.
+        let ret = unsafe { libc::ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SEND, resp) };
+        if ret < 0 {
+            return Err(io::Error::last_os_error());
+        }
+        Ok(())
+    }
+
+    /// Check if a notification ID is still valid.
+    ///
+    /// Note: uses the post-Linux-5.17 ioctl constant (`_IOW`). On kernels
+    /// 5.0-5.16, this always returns `false` (the old constant was `_IOR`).
+    /// Callers should treat a `false` result as "proceed with caution" and
+    /// verify the operation result, not as "definitely expired."
+    pub fn id_valid(notify_fd: RawFd, id: u64) -> bool {
+        // SAFETY: `&id` points to a live u64 for the duration of the ioctl.
+        let ret = unsafe { libc::ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) };
+        ret == 0
+    }
+
+    /// Open a pid fd for a process.
+    ///
+    /// Returns the raw pidfd; the caller owns it and must close it.
+    ///
+    /// # Errors
+    ///
+    /// Returns the OS error if the `pidfd_open` syscall fails.
+    pub fn pidfd_open(pid: u32) -> io::Result<RawFd> {
+        #[allow(clippy::cast_possible_wrap)]
+        let pid_t = pid as libc::pid_t;
+        // SAFETY: plain syscall taking integer arguments only; no memory is
+        // shared with the kernel.
+        let fd = unsafe { libc::syscall(SYS_PIDFD_OPEN, pid_t, 0_u32) };
+        if fd < 0 {
+            return Err(io::Error::last_os_error());
+        }
+        Ok(fd as RawFd)
+    }
+
+    /// Duplicate a file descriptor from another process via its pidfd.
+    ///
+    /// Returns the raw duplicated fd; the caller owns it and must close it.
+    ///
+    /// # Security Note
+    ///
+    /// A multi-threaded child can `dup2()` a different fd into `target_fd`
+    /// between the notification and this call. The caller should verify the
+    /// duplicated fd is a socket of the expected type after duplication.
+    ///
+    /// # Errors
+    ///
+    /// Returns the OS error if the `pidfd_getfd` syscall fails.
+    pub fn pidfd_getfd(pidfd: RawFd, target_fd: RawFd) -> io::Result<RawFd> {
+        // SAFETY: plain syscall taking integer arguments only; no memory is
+        // shared with the kernel.
+        let fd = unsafe { libc::syscall(SYS_PIDFD_GETFD, pidfd, target_fd, 0_u32) };
+        if fd < 0 {
+            return Err(io::Error::last_os_error());
+        }
+        Ok(fd as RawFd)
+    }
+
+    /// Verify that a duplicated fd is a socket (not a regular file or pipe).
+    ///
+    /// Call this after `pidfd_getfd()` to mitigate the fd-swap race: if a
+    /// malicious child `dup2()`d a non-socket fd into the target slot, this
+    /// check catches it.
+    ///
+    /// # Errors
+    ///
+    /// Returns the OS error if `fstat` fails (e.g. `fd` is not open).
+    pub fn verify_socket_fd(fd: RawFd) -> io::Result<bool> {
+        // SAFETY: `libc::stat` is plain old data, so all-zero bytes are a
+        // valid value.
+        let mut stat: libc::stat = unsafe { mem::zeroed() };
+        // SAFETY: fstat on a valid fd is safe. The stat struct is zeroed and
+        // fully written by the kernel on success.
+        let ret = unsafe { libc::fstat(fd, std::ptr::from_mut(&mut stat)) };
+        if ret < 0 {
+            return Err(io::Error::last_os_error());
+        }
+        // S_IFSOCK = 0o140000
+        Ok((stat.st_mode & libc::S_IFMT) == libc::S_IFSOCK)
+    }
+
+    /// Read bytes from another process's memory via `/proc/<pid>/mem`.
+    ///
+    /// Uses `read_exact` to ensure the full buffer is filled. Returns an
+    /// error if the read is short (e.g., at an unmapped page boundary).
+    ///
+    /// # Security Note
+    ///
+    /// Between the notification and this read, the process may exit and the
+    /// pid may be recycled. Call `id_valid()` before and check the result
+    /// of this function. For stronger guarantees, use `process_vm_readv()`
+    /// with a pidfd (not implemented here).
+    ///
+    /// # Errors
+    ///
+    /// Returns the I/O error from opening, seeking, or reading the file.
+    pub fn read_process_memory(pid: u32, addr: u64, buf: &mut [u8]) -> io::Result<()> {
+        use std::io::{Read, Seek};
+
+        let path = format!("/proc/{pid}/mem");
+        let mut file = std::fs::File::open(&path)?;
+        file.seek(io::SeekFrom::Start(addr))?;
+        file.read_exact(buf)
+    }
+
+    /// Parse a `sockaddr_in` or `sockaddr_in6` from raw bytes.
+    ///
+    /// `sa_family` is in native byte order. Port is in network (big-endian)
+    /// byte order per the `sockaddr_in` ABI.
+    ///
+    /// Returns `None` when the buffer is shorter than the structure implied
+    /// by its family, or when the family is neither `AF_INET` nor `AF_INET6`.
+    pub fn parse_sockaddr(buf: &[u8]) -> Option<std::net::SocketAddr> {
+        if buf.len() < 2 {
+            return None;
+        }
+
+        // sa_family is in native byte order
+        let family = u16::from_ne_bytes([buf[0], buf[1]]);
+
+        match i32::from(family) {
+            libc::AF_INET if buf.len() >= size_of::<libc::sockaddr_in>() => {
+                // sin_port is in network (big-endian) byte order
+                let port = u16::from_be_bytes([buf[2], buf[3]]);
+                let ip = std::net::Ipv4Addr::new(buf[4], buf[5], buf[6], buf[7]);
+                Some(std::net::SocketAddr::V4(std::net::SocketAddrV4::new(
+                    ip, port,
+                )))
+            }
+            libc::AF_INET6 if buf.len() >= size_of::<libc::sockaddr_in6>() => {
+                let port = u16::from_be_bytes([buf[2], buf[3]]);
+                // sin6_flowinfo at bytes 4-7 (network byte order)
+                let flowinfo = u32::from_be_bytes([buf[4], buf[5], buf[6], buf[7]]);
+                // sin6_addr at bytes 8-23
+                let mut ip_bytes = [0u8; 16];
+                ip_bytes.copy_from_slice(&buf[8..24]);
+                let ip = std::net::Ipv6Addr::from(ip_bytes);
+                // sin6_scope_id at bytes 24-27 (native byte order per POSIX)
+                let scope_id = u32::from_ne_bytes([buf[24], buf[25], buf[26], buf[27]]);
+                Some(std::net::SocketAddr::V6(std::net::SocketAddrV6::new(
+                    ip, port, flowinfo, scope_id,
+                )))
+            }
+            _ => None,
+        }
+    }
+
+    /// Deny a notification with EPERM.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `send_resp`.
+    pub fn deny(notify_fd: RawFd, id: u64) -> io::Result<()> {
+        send_resp(
+            notify_fd,
+            &SeccompNotifResp {
+                id,
+                val: 0,
+                // The errno is passed negated.
+                error: -libc::EPERM,
+                flags: 0,
+            },
+        )
+    }
+
+    /// Allow a `connect()` notification by returning success (0) to the caller.
+    ///
+    /// **Only valid for `connect()` syscalls** which return 0 on success.
+    /// For `sendto()`/`sendmsg()`, the supervisor must perform the operation
+    /// on behalf of the child and return the actual byte count via
+    /// `send_resp()` directly.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `send_resp`.
+    pub fn allow_connect(notify_fd: RawFd, id: u64) -> io::Result<()> {
+        send_resp(
+            notify_fd,
+            &SeccompNotifResp {
+                id,
+                val: 0,
+                error: 0,
+                flags: 0,
+            },
+        )
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::net::{Ipv4Addr, Ipv6Addr};
+
+    #[test]
+    fn allowlist_permits_loopback() {
+        let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128);
+        let allowlist = DnsPinnedAllowlist::new(proxy);
+        assert!(allowlist.is_allowed(&IpAddr::V4(Ipv4Addr::LOCALHOST)));
+        assert!(allowlist.is_allowed(&IpAddr::V6(Ipv6Addr::LOCALHOST)));
+    }
+
+    #[test]
+    fn allowlist_denies_arbitrary_ip() {
+        let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128);
+        let allowlist = DnsPinnedAllowlist::new(proxy);
+        let external = IpAddr::V4(Ipv4Addr::new(198, 51, 100, 5));
+        assert!(!allowlist.is_allowed(&external));
+    }
+
+    #[test]
+    fn allowlist_resolves_domain() {
+        let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128);
+        let mut allowlist = DnsPinnedAllowlist::new(proxy);
+        let count = allowlist.add_domain("localhost").unwrap();
+        assert!(count > 0);
+        assert!(allowlist.is_allowed(&IpAddr::V4(Ipv4Addr::LOCALHOST)));
+    }
+
+    #[test]
+    fn allowlist_proxy_addr() {
+        let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128);
+        let allowlist = DnsPinnedAllowlist::new(proxy);
+        assert_eq!(allowlist.proxy_addr(), proxy);
+    }
+
+    #[test]
+    fn allowlist_len_and_empty() {
+        let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128);
+        let allowlist = DnsPinnedAllowlist::new(proxy);
+        assert!(allowlist.len() >= 2);
+        // A list that reports at least two entries cannot also be empty.
+        assert!(!allowlist.is_empty());
+    }
+
+    #[cfg(target_os = "linux")]
+    mod linux_tests {
+        use super::super::linux::parse_sockaddr;
+
+        #[test]
+        fn parse_ipv4_sockaddr() {
+            let mut buf = [0u8; 16];
+            // AF_INET = 2 in native byte order
+            let family_bytes = 2u16.to_ne_bytes();
+            buf[0] = family_bytes[0];
+            buf[1] = family_bytes[1];
+            // Port 443 in big-endian
+            buf[2] = 0x01;
+            buf[3] = 0xBB;
+            // IP 93.184.216.34
+            buf[4] = 93;
+            buf[5] = 184;
+            buf[6] = 216;
+            buf[7] = 34;
+
+            let addr = parse_sockaddr(&buf).unwrap();
+            assert_eq!(addr.port(), 443);
+            match addr {
+                std::net::SocketAddr::V4(v4) => {
+                    assert_eq!(*v4.ip(), std::net::Ipv4Addr::new(93, 184, 216, 34));
+                }
+                std::net::SocketAddr::V6(_) => panic!("expected IPv4"),
+            }
+        }
+
+        #[test]
+        fn parse_too_short_returns_none() {
+            let buf = [0u8; 1];
+            assert!(parse_sockaddr(&buf).is_none());
+        }
+
+        #[test]
+        fn parse_unknown_family_returns_none() {
+            let buf = [0xFF; 16];
+            assert!(parse_sockaddr(&buf).is_none());
+        }
+    }
+}
diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs
index 30a21c643..25e9dbc6e 100644
--- a/crates/openshell-server/src/compute/mod.rs
+++ b/crates/openshell-server/src/compute/mod.rs
@@ -1271,6 +1271,14 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec {
         gpu: spec.gpu,
         gpu_device: spec.gpu_device.clone(),
         sandbox_token: String::new(),
+        // Clamp to known NetworkEnforcementMode values. Unknown values
+        // default to NAMESPACE (0) for safety -- elevated capabilities are
+        // the more conservative posture since the supervisor can always
+        // create a network namespace for full isolation.
+        network_enforcement: match spec.policy.as_ref().map_or(0, |p| p.network_enforcement) {
+            1 => 1, // PLATFORM
+            _ => 0, // NAMESPACE (default; unknown values fall back here)
+        },
     }
 }
 
diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto
index 610d491c7..48a7f4beb 100644
--- a/proto/compute_driver.proto
+++ b/proto/compute_driver.proto
@@ -6,6 +6,7 @@ syntax = "proto3";
 package openshell.compute.v1;
 
 import "google/protobuf/struct.proto";
+import "sandbox.proto";
 
 // Internal compute-driver contract used by the gateway.
 //
@@ -96,6 +97,10 @@ message DriverSandboxSpec {
   // ServiceAccount token bootstrap instead). Never echoed to the public
   // Sandbox proto.
   string sandbox_token = 11;
+  // Network enforcement mode for this sandbox. When set to
+  // NETWORK_ENFORCEMENT_PLATFORM (1), the sandbox runs without elevated
+  // capabilities. Populated by the gateway from the SandboxPolicy.
+  openshell.sandbox.v1.NetworkEnforcementMode network_enforcement = 12;
 }
 
 // Driver-owned runtime template consumed by the compute platform.
diff --git a/proto/sandbox.proto b/proto/sandbox.proto
index ef0b0540f..e3330b3d0 100644
--- a/proto/sandbox.proto
+++ b/proto/sandbox.proto
@@ -13,6 +13,18 @@ package openshell.sandbox.v1;
 // - Public sandbox resource types live in `openshell.proto`.
 // - Internal compute-driver sandbox observation types live in `compute_driver.proto`.
 
+// Network enforcement strategy for sandbox isolation.
+enum NetworkEnforcementMode {
+  // Use a dedicated network namespace with veth pair and nftables bypass
+  // rules. Requires CAP_SYS_ADMIN and CAP_NET_ADMIN. Default.
+  NETWORK_ENFORCEMENT_NAMESPACE = 0;
+  // Rely on Kubernetes NetworkPolicy for L3/L4 egress control. The
+  // supervisor binds the CONNECT proxy to loopback instead of veth. No
+  // elevated capabilities required -- compatible with restricted-v2 SCC
+  // and restricted Pod Security Standard.
+  NETWORK_ENFORCEMENT_PLATFORM = 1;
+}
+
 // Sandbox security policy configuration.
 message SandboxPolicy {
   // Policy version.
@@ -25,6 +37,9 @@ message SandboxPolicy {
   ProcessPolicy process = 4;
   // Network access policies keyed by name (e.g. "claude_code", "gitlab").
   map network_policies = 5;
+  // Network enforcement mode. Default (0) preserves current namespace-based
+  // isolation for backward compatibility.
+  NetworkEnforcementMode network_enforcement = 6;
 }
 
 // Filesystem access policy.