From 1eef6e3f5748852792918947a205b86b5ef69aae Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Thu, 25 Jun 2026 14:42:47 +0200 Subject: [PATCH 1/2] feat(sandbox): add Platform network mode for restricted K8s platforms Add NetworkMode::Platform for running the supervisor without elevated capabilities on Kubernetes platforms enforcing the restricted Pod Security Standard (including OpenShift restricted-v2 SCC). Platform Mode keeps Landlock filesystem isolation, seccomp syscall filtering, OPA policy evaluation, credential injection, and L7 inspection via a loopback CONNECT proxy. It replaces the network namespace (which requires CAP_SYS_ADMIN + CAP_NET_ADMIN) with: - Loopback proxy binding (127.0.0.1 instead of veth interface) - K8s driver: zero capabilities, drop ALL, non-root UID - K8s driver: shareProcessNamespace for /proc identity resolution without CAP_SYS_PTRACE - K8s driver: per-sandbox deny-all egress NetworkPolicy (loopback traffic to the in-pod proxy is unaffected by NetworkPolicy) - seccomp: block SOCK_DGRAM (UDP) in Platform mode to match the nftables UDP reject in namespace mode - Landlock scope: restrict abstract Unix sockets and signals (ABI v5+, BestEffort degrades on older kernels) Security parity with namespace mode: | Attack | Namespace mode | Platform mode | |------------------------|------------------------|--------------------------| | TCP bypass proxy | nftables REJECT | Landlock port 3128 only | | UDP exfiltration | nftables REJECT | seccomp SOCK_DGRAM block | | TCP to other-pod:3128 | nftables REJECT | egress NetworkPolicy | | Abstract Unix sockets | netns isolation | Landlock scope | | Container escape | Risk (CAP_SYS_ADMIN) | Impossible (zero caps) | Proto: add NetworkEnforcementMode enum and field to SandboxPolicy and DriverSandboxSpec. Default NAMESPACE (0) preserves existing behavior; PLATFORM (1) activates the new mode. 
Closes NVIDIA/OpenShell#899 Signed-off-by: Ladislav Smola --- crates/openshell-driver-docker/src/tests.rs | 1 + .../openshell-driver-kubernetes/src/driver.rs | 167 ++++++++++++++--- crates/openshell-policy/src/lib.rs | 3 + crates/openshell-sandbox/src/lib.rs | 173 ++++++++++-------- crates/openshell-sandbox/src/opa.rs | 11 ++ crates/openshell-sandbox/src/policy.rs | 15 +- crates/openshell-sandbox/src/process.rs | 44 ++--- .../src/sandbox/linux/landlock.rs | 22 ++- .../src/sandbox/linux/seccomp.rs | 91 ++++++--- crates/openshell-server/src/compute/mod.rs | 8 + proto/compute_driver.proto | 5 + proto/sandbox.proto | 15 ++ 12 files changed, 405 insertions(+), 150 deletions(-) diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index c9b34ff8f..935b64d71 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -45,6 +45,7 @@ fn test_sandbox() -> DriverSandbox { gpu: false, gpu_device: String::new(), sandbox_token: String::new(), + network_enforcement: 0, }), status: None, } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 5a43eb980..f5c455da2 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -330,10 +330,32 @@ impl KubernetesComputeDriver { enable_user_namespaces: self.config.enable_user_namespaces, workspace_default_storage_size: &self.config.workspace_default_storage_size, sa_token_ttl_secs: self.config.effective_sa_token_ttl_secs(), + is_platform_mode: sandbox + .spec + .as_ref() + .is_some_and(|s| s.network_enforcement == 1), }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), &params); let api = self.api(); + // Platform mode: emit a deny-all egress NetworkPolicy for this sandbox. + // Loopback traffic (to the in-pod proxy on 127.0.0.1:3128) is unaffected + // by NetworkPolicy. 
This closes the Landlock port-not-IP gap where + // connect(other-pod:3128) would bypass the sandbox's OPA policy. + if params.is_platform_mode { + if let Err(e) = self + .create_sandbox_network_policy(name, &sandbox.id) + .await + { + warn!( + sandbox_id = %sandbox.id, + error = %e, + "Failed to create egress NetworkPolicy for platform-mode sandbox \ + (sandbox will rely on Landlock + seccomp only)" + ); + } + } + match tokio::time::timeout(KUBE_API_TIMEOUT, api.create(&PostParams::default(), &obj)).await { Ok(Ok(_result)) => { @@ -368,6 +390,67 @@ impl KubernetesComputeDriver { } } + /// Create a deny-all egress NetworkPolicy for a platform-mode sandbox pod. + /// + /// The policy selects the specific sandbox pod by its `openshell.ai/sandbox-id` + /// label and denies all egress. Loopback traffic (127.0.0.1:3128 to the + /// in-pod CONNECT proxy) is unaffected by Kubernetes NetworkPolicy. + async fn create_sandbox_network_policy( + &self, + sandbox_name: &str, + sandbox_id: &str, + ) -> Result<(), KubernetesDriverError> { + let np_name = format!("{sandbox_name}-egress"); + let np = serde_json::json!({ + "apiVersion": "networking.k8s.io/v1", + "kind": "NetworkPolicy", + "metadata": { + "name": np_name, + "namespace": self.config.namespace, + "labels": { + "openshell.ai/managed-by": "openshell", + "openshell.ai/sandbox-id": sandbox_id, + }, + }, + "spec": { + "podSelector": { + "matchLabels": { + "openshell.ai/sandbox-id": sandbox_id, + }, + }, + "policyTypes": ["Egress"], + "egress": [], + }, + }); + + let gvk = GroupVersionKind::gvk("networking.k8s.io", "v1", "NetworkPolicy"); + let resource = ApiResource::from_gvk(&gvk); + let api: Api = + Api::namespaced_with(self.client.clone(), &self.config.namespace, &resource); + let obj = serde_json::from_value::(np) + .map_err(|e| KubernetesDriverError::Message(format!("NetworkPolicy serialization: {e}")))?; + + match tokio::time::timeout(KUBE_API_TIMEOUT, api.create(&PostParams::default(), &obj)).await + { + Ok(Ok(_)) => 
{ + info!( + sandbox_id = %sandbox_id, + policy = %np_name, + "Created deny-all egress NetworkPolicy for platform-mode sandbox" + ); + Ok(()) + } + Ok(Err(KubeError::Api(err))) if err.code == 409 => { + debug!(policy = %np_name, "NetworkPolicy already exists"); + Ok(()) + } + Ok(Err(err)) => Err(KubernetesDriverError::from_kube(err)), + Err(_) => Err(KubernetesDriverError::Message( + "timed out creating NetworkPolicy".into(), + )), + } + } + pub async fn delete_sandbox(&self, name: &str) -> Result { info!( sandbox_name = %name, @@ -823,6 +906,7 @@ fn apply_supervisor_sideload( supervisor_image: &str, supervisor_image_pull_policy: &str, method: SupervisorSideloadMethod, + is_platform_mode: bool, ) { let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { return; @@ -882,16 +966,16 @@ fn apply_supervisor_sideload( serde_json::json!([format!("{}/openshell-sandbox", SUPERVISOR_MOUNT_PATH)]), ); - // Force the supervisor to run as root (UID 0). Sandbox images may set - // a non-root USER directive (e.g. `USER sandbox`), but the supervisor - // needs root to create network namespaces, set up the proxy, and - // configure Landlock/seccomp. The supervisor itself drops privileges - // for child processes via the policy's `run_as_user`/`run_as_group`. - let security_context = container - .entry("securityContext") - .or_insert_with(|| serde_json::json!({})); - if let Some(sc) = security_context.as_object_mut() { - sc.insert("runAsUser".to_string(), serde_json::json!(0)); + // In namespace mode, force root (UID 0) so the supervisor can create + // network namespaces and drop privileges for child processes. + // In platform mode, keep the image's default non-root user. 
+ if !is_platform_mode { + let security_context = container + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = security_context.as_object_mut() { + sc.insert("runAsUser".to_string(), serde_json::json!(0)); + } } // Add volume mount @@ -1044,6 +1128,10 @@ struct SandboxPodParams<'a> { /// Lifetime (seconds) of the projected `ServiceAccount` token used /// for the bootstrap `IssueSandboxToken` exchange. sa_token_ttl_secs: i64, + /// Platform network enforcement mode (Issue #899). When true, sandbox + /// pods are emitted without elevated capabilities, compatible with + /// restricted-v2 SCC and restricted Pod Security Standard. + is_platform_mode: bool, } impl Default for SandboxPodParams<'_> { @@ -1065,6 +1153,7 @@ impl Default for SandboxPodParams<'_> { enable_user_namespaces: false, workspace_default_storage_size: DEFAULT_WORKSPACE_STORAGE_SIZE, sa_token_ttl_secs: 3600, + is_platform_mode: false, } } } @@ -1233,6 +1322,18 @@ fn sandbox_template_to_k8s( serde_json::json!(false), ); + // Platform mode: share the PID namespace between supervisor and agent + // containers so the proxy can resolve process identity via /proc/<pid>/exe + // without CAP_SYS_PTRACE. Same-PID-namespace /proc reads work for + // same-UID processes; cross-UID reads work because the supervisor runs + // as the pod's non-root user alongside the agent. + if params.is_platform_mode { + spec.insert( + "shareProcessNamespace".to_string(), + serde_json::json!(true), + ); + } + let mut container = serde_json::Map::new(); container.insert("name".to_string(), serde_json::json!("agent")); // Use template image if provided, otherwise fall back to default @@ -1265,22 +1366,32 @@ fn sandbox_template_to_k8s( container.insert("env".to_string(), serde_json::Value::Array(env)); - let mut capabilities: Vec<&str> = vec!["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]; - if use_user_namespaces { - // In a user namespace the bounding set is reset. 
SETUID/SETGID are - // needed for the supervisor to drop privileges to the sandbox user. - // DAC_READ_SEARCH is needed for cross-UID /proc/<pid>/fd/ access - // for process identity resolution in network policy enforcement. - capabilities.extend(["SETUID", "SETGID", "DAC_READ_SEARCH"]); + if params.is_platform_mode { + // Platform mode: zero elevated capabilities. Compatible with + // restricted-v2 SCC and restricted Pod Security Standard. + container.insert( + "securityContext".to_string(), + serde_json::json!({ + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + } + }), + ); + } else { + let mut capabilities: Vec<&str> = vec!["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]; + if use_user_namespaces { + capabilities.extend(["SETUID", "SETGID", "DAC_READ_SEARCH"]); + } + container.insert( + "securityContext".to_string(), + serde_json::json!({ + "capabilities": { + "add": capabilities + } + }), + ); } - container.insert( - "securityContext".to_string(), - serde_json::json!({ - "capabilities": { - "add": capabilities - } - }), - ); // Mount client TLS secret for mTLS to the server, plus the projected // ServiceAccount token used to bootstrap the sandbox's gateway JWT @@ -1363,6 +1474,7 @@ fn sandbox_template_to_k8s( params.supervisor_image, params.supervisor_image_pull_policy, params.supervisor_sideload_method, + params.is_platform_mode, ); // Inject workspace persistence (init container + PVC volume mount) so @@ -1750,6 +1862,7 @@ mod tests { "custom-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -1779,6 +1892,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -1804,6 +1918,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); // Volume should be an 
emptyDir @@ -1878,6 +1993,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::ImageVolume, + false, ); let volumes = pod_template["spec"]["volumes"] @@ -1932,6 +2048,7 @@ mod tests { "supervisor-image:latest", "", SupervisorSideloadMethod::ImageVolume, + false, ); let volume = &pod_template["spec"]["volumes"][0]; diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index 26c8fc9d3..d55a2806c 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -378,6 +378,7 @@ fn to_proto(raw: PolicyFile) -> SandboxPolicy { run_as_group: p.run_as_group, }), network_policies, + network_enforcement: 0, } } @@ -649,6 +650,7 @@ pub fn restrictive_default_policy() -> SandboxPolicy { run_as_group: "sandbox".into(), }), network_policies: HashMap::new(), + network_enforcement: 0, // NAMESPACE (default) } } @@ -1262,6 +1264,7 @@ network_policies: filesystem: None, landlock: None, network_policies: HashMap::new(), + network_enforcement: 0, }; assert!(validate_sandbox_policy(&policy).is_ok()); } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 126416546..8e4fc9286 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -487,7 +487,10 @@ pub async fn run_sandbox( // Generate ephemeral CA and TLS state for HTTPS L7 inspection. // The CA cert is written to disk so sandbox processes can trust it. - let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { + let (tls_state, ca_file_paths) = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { match SandboxCa::generate() { Ok(ca) => { let tls_dir = std::path::Path::new("/etc/openshell-tls"); @@ -600,79 +603,91 @@ pub async fn run_sandbox( // the entrypoint process's /proc/net/tcp for identity binding. 
let entrypoint_pid = Arc::new(AtomicU32::new(0)); - let (_proxy, denial_rx, bypass_denial_tx, activity_rx, bypass_activity_tx) = - if matches!(policy.network.mode, NetworkMode::Proxy) { - let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!( - "Network mode is set to proxy but no proxy configuration was provided" - ) - })?; + let (_proxy, denial_rx, bypass_denial_tx, activity_rx, bypass_activity_tx) = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { + let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!("Network mode is set to proxy but no proxy configuration was provided") + })?; - let engine = opa_engine.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") - })?; + let engine = opa_engine.clone().ok_or_else(|| { + miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") + })?; - let cache = identity_cache.clone().ok_or_else(|| { - miette::miette!( - "Proxy mode requires an identity cache (OPA engine must be configured)" - ) - })?; + let cache = identity_cache.clone().ok_or_else(|| { + miette::miette!("Proxy mode requires an identity cache (OPA engine must be configured)") + })?; + + // If we have a network namespace, bind to the veth host IP so sandboxed + // processes can reach the proxy via TCP. + #[cfg(target_os = "linux")] + let bind_addr = netns.as_ref().map(|ns| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ns.host_ip(), port) + }); - // If we have a network namespace, bind to the veth host IP so sandboxed - // processes can reach the proxy via TCP. - #[cfg(target_os = "linux")] - let bind_addr = netns.as_ref().map(|ns| { + // Platform mode: no netns, bind proxy to loopback. 
+ #[cfg(target_os = "linux")] + let bind_addr = bind_addr.or_else(|| { + if matches!(policy.network.mode, NetworkMode::Platform) { let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ns.host_ip(), port) - }); + Some(SocketAddr::new( + std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), + port, + )) + } else { + None + } + }); - #[cfg(not(target_os = "linux"))] - let bind_addr: Option = None; + #[cfg(not(target_os = "linux"))] + let bind_addr: Option = None; - // Build inference context for local routing of intercepted inference calls. - let inference_ctx = build_inference_context( - sandbox_id.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; - - // Create denial aggregator channel if in gRPC mode (sandbox_id present). - // Clone the sender for the bypass monitor before passing to the proxy. - let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) - } else { - (None, None, None) - }; - let (activity_tx, activity_rx, bypass_activity_tx) = - activity_collection_channels(sandbox_id.as_deref()); - - let proxy_handle = ProxyHandle::start_with_bind_addr( - proxy_policy, - bind_addr, - engine, - cache, - entrypoint_pid.clone(), - tls_state, - inference_ctx, - Some(provider_credentials.clone()), - Some(policy_local_ctx.clone()), - denial_tx, - activity_tx, - ) - .await?; - ( - Some(proxy_handle), - denial_rx, - bypass_denial_tx, - activity_rx, - bypass_activity_tx, - ) + // Build inference context for local routing of intercepted inference calls. + let inference_ctx = build_inference_context( + sandbox_id.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?; + + // Create denial aggregator channel if in gRPC mode (sandbox_id present). 
+ // Clone the sender for the bypass monitor before passing to the proxy. + let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) } else { - (None, None, None, None, None) + (None, None, None) }; + let (activity_tx, activity_rx, bypass_activity_tx) = + activity_collection_channels(sandbox_id.as_deref()); + + let proxy_handle = ProxyHandle::start_with_bind_addr( + proxy_policy, + bind_addr, + engine, + cache, + entrypoint_pid.clone(), + tls_state, + inference_ctx, + Some(provider_credentials.clone()), + Some(policy_local_ctx.clone()), + denial_tx, + activity_tx, + ) + .await?; + ( + Some(proxy_handle), + denial_rx, + bypass_denial_tx, + activity_rx, + bypass_activity_tx, + ) + } else { + (None, None, None, None, None) + }; // Spawn bypass detection monitor (Linux only, proxy mode only). // Reads /dev/kmsg for nftables log entries and emits structured @@ -705,18 +720,30 @@ pub async fn run_sandbox( #[cfg(not(target_os = "linux"))] let ssh_netns_fd: Option = None; - let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { + let ssh_proxy_url = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { #[cfg(target_os = "linux")] { - netns.as_ref().map(|ns| { + if let Some(ns) = netns.as_ref() { let port = policy .network .proxy .as_ref() .and_then(|p| p.http_addr) .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) + Some(format!("http://{}:{port}", ns.host_ip())) + } else { + // Platform mode: proxy on loopback + let port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + Some(format!("http://127.0.0.1:{port}")) + } } #[cfg(not(target_os = "linux"))] { @@ -1729,8 +1756,10 @@ where } fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { - let (ro, rw) = - 
active_baseline_enrichment_paths(matches!(policy.network.mode, NetworkMode::Proxy)); + let (ro, rw) = active_baseline_enrichment_paths(matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + )); let modified = enrich_sandbox_baseline_paths_with(policy, &ro, &rw, std::path::Path::exists); if modified { diff --git a/crates/openshell-sandbox/src/opa.rs b/crates/openshell-sandbox/src/opa.rs index f5ff5923b..cbb7b4074 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-sandbox/src/opa.rs @@ -1262,6 +1262,7 @@ mod tests { run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, } } @@ -2518,6 +2519,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2641,6 +2643,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2698,6 +2701,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2755,6 +2759,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -3704,6 +3709,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); let input = NetworkInput { @@ -3758,6 +3764,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); let input = NetworkInput { @@ -3828,6 +3835,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + 
network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("Failed to create engine from proto"); @@ -4058,6 +4066,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).unwrap(); // Port 443 @@ -5017,6 +5026,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; // Build engine with our PID (symlink resolution will work via /proc/self/root/) @@ -5094,6 +5104,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; // Initial load at pid=0 — no symlink expansion diff --git a/crates/openshell-sandbox/src/policy.rs b/crates/openshell-sandbox/src/policy.rs index 0827fa0d0..fde345b83 100644 --- a/crates/openshell-sandbox/src/policy.rs +++ b/crates/openshell-sandbox/src/policy.rs @@ -5,7 +5,8 @@ use openshell_core::proto::{ FilesystemPolicy as ProtoFilesystemPolicy, LandlockPolicy as ProtoLandlockPolicy, - ProcessPolicy as ProtoProcessPolicy, SandboxPolicy as ProtoSandboxPolicy, + NetworkEnforcementMode, ProcessPolicy as ProtoProcessPolicy, + SandboxPolicy as ProtoSandboxPolicy, }; use std::net::SocketAddr; use std::path::PathBuf; @@ -62,6 +63,9 @@ pub enum NetworkMode { Block, Proxy, Allow, + /// Platform mode: Landlock + seccomp + loopback proxy, no network namespace. + /// Compatible with restricted-v2 SCC and restricted Pod Security Standard. + Platform, } #[derive(Debug, Clone)] @@ -99,10 +103,13 @@ impl TryFrom for SandboxPolicy { type Error = miette::Report; fn try_from(proto: ProtoSandboxPolicy) -> Result { - // In cluster mode we always run with proxy networking so all egress - // can be evaluated by OPA and `inference.local` is always addressable. 
+ let mode = match proto.network_enforcement() { + NetworkEnforcementMode::NetworkEnforcementNamespace => NetworkMode::Proxy, + NetworkEnforcementMode::NetworkEnforcementPlatform => NetworkMode::Platform, + }; + let network = NetworkPolicy { - mode: NetworkMode::Proxy, + mode, proxy: Some(ProxyPolicy { http_addr: None }), }; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index d004bb7d4..97e0207f0 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -226,27 +226,25 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { + if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { let proxy = policy.network.proxy.as_ref().ok_or_else(|| { miette::miette!( "Network mode is set to proxy but no proxy configuration was provided" ) })?; - // When using network namespace, set proxy URL to the veth host IP - if netns_fd.is_some() { - // The proxy is on 10.200.0.1:3128 (or configured port) - let port = proxy.http_addr.map_or(3128, |addr| addr.port()); - let proxy_url = format!("http://10.200.0.1:{port}"); - // Both uppercase and lowercase variants: curl/wget use uppercase, - // gRPC C-core (libgrpc) checks lowercase http_proxy/https_proxy. 
- for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } - } else if let Some(http_addr) = proxy.http_addr { - let proxy_url = format!("http://{http_addr}"); - for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } + let port = proxy.http_addr.map_or(3128, |addr| addr.port()); + let proxy_url = if netns_fd.is_some() { + // Namespace mode: proxy on veth host IP + format!("http://10.200.0.1:{port}") + } else { + // Platform mode (or non-Linux): proxy on loopback + format!("http://127.0.0.1:{port}") + }; + for (key, value) in child_env::proxy_env_vars(&proxy_url) { + cmd.env(key, value); } } @@ -368,17 +366,19 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { + if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { let proxy = policy.network.proxy.as_ref().ok_or_else(|| { miette::miette!( "Network mode is set to proxy but no proxy configuration was provided" ) })?; - if let Some(http_addr) = proxy.http_addr { - let proxy_url = format!("http://{http_addr}"); - for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } + let port = proxy.http_addr.map_or(3128, |addr| addr.port()); + let proxy_url = format!("http://127.0.0.1:{port}"); + for (key, value) in child_env::proxy_env_vars(&proxy_url) { + cmd.env(key, value); } } diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index e7f37ce4f..4ef701680 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -3,10 +3,10 @@ //! Landlock filesystem sandboxing. 
-use crate::policy::{LandlockCompatibility, SandboxPolicy}; +use crate::policy::{LandlockCompatibility, NetworkMode, SandboxPolicy}; use landlock::{ ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, - RulesetAttr, RulesetCreatedAttr, + RulesetAttr, RulesetCreatedAttr, Scope, }; use miette::{IntoDiagnostic, Result}; use std::path::{Path, PathBuf}; @@ -115,7 +115,8 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result) -> Result) -> Result Result<()> { } pub fn apply(policy: &SandboxPolicy) -> Result<()> { - let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy | NetworkMode::Allow); - let main_filter = build_filter(allow_inet)?; + let allow_inet = matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Allow | NetworkMode::Platform + ); + let block_udp = matches!(policy.network.mode, NetworkMode::Platform); + let main_filter = build_filter(allow_inet, block_udp)?; let clone3_filter = build_clone3_filter()?; set_no_new_privs()?; @@ -80,8 +84,8 @@ pub fn apply(policy: &SandboxPolicy) -> Result<()> { Ok(()) } -fn build_filter(allow_inet: bool) -> Result { - let rules = build_filter_rules(allow_inet)?; +fn build_filter(allow_inet: bool, block_udp: bool) -> Result { + let rules = build_filter_rules(allow_inet, block_udp)?; compile_filter(rules, SeccompAction::Errno(libc::EPERM as u32)) } @@ -181,7 +185,10 @@ fn apply_runtime_filters( Ok(()) } -fn build_filter_rules(allow_inet: bool) -> Result>> { +fn build_filter_rules( + allow_inet: bool, + block_udp: bool, +) -> Result>> { let mut rules: BTreeMap> = BTreeMap::new(); // --- Socket domain blocks --- @@ -202,6 +209,21 @@ fn build_filter_rules(allow_inet: bool) -> Result add_socket_domain_rule(&mut rules, domain)?; } + // Block UDP sockets (SOCK_DGRAM) on AF_INET/AF_INET6. + // + // The agent doesn't need UDP: all traffic goes through the CONNECT proxy + // on 127.0.0.1:3128, which resolves DNS on behalf of the agent. 
This + // matches Full OpenShell behavior where nftables rejects all UDP in the + // network namespace (nft_ruleset.rs:48-49 has no UDP accept rule). + // + // Without this block, an agent could exfiltrate data via DNS tunneling + // (encoding secrets in DNS subdomain labels) or send UDP packets to + // arbitrary destinations -- Landlock ABI v4 only covers TCP. + if block_udp { + add_sock_dgram_block(&mut rules, libc::AF_INET)?; + add_sock_dgram_block(&mut rules, libc::AF_INET6)?; + } + // Allow AF_NETLINK only for NETLINK_ROUTE (protocol 0). // // NETLINK_ROUTE is needed by getifaddrs(3) which is called by Node.js, @@ -339,6 +361,29 @@ fn add_netlink_non_route_rule(rules: &mut BTreeMap>) -> Re Ok(()) } +/// Block `socket(domain, SOCK_DGRAM, *)` to prevent UDP socket creation. +/// +/// Uses `MaskedEq` on arg1 with mask `0xF` (SOCK_TYPE_MASK) to match +/// `SOCK_DGRAM` (2) regardless of `SOCK_NONBLOCK` or `SOCK_CLOEXEC` flags. +#[allow(clippy::cast_sign_loss)] +fn add_sock_dgram_block(rules: &mut BTreeMap>, domain: i32) -> Result<()> { + let domain_condition = + SeccompCondition::new(0, SeccompCmpArgLen::Dword, SeccompCmpOp::Eq, domain as u64) + .into_diagnostic()?; + + let type_condition = SeccompCondition::new( + 1, // type argument + SeccompCmpArgLen::Dword, + SeccompCmpOp::MaskedEq(0xF), // SOCK_TYPE_MASK + libc::SOCK_DGRAM as u64, + ) + .into_diagnostic()?; + + let rule = SeccompRule::new(vec![domain_condition, type_condition]).into_diagnostic()?; + rules.entry(libc::SYS_socket).or_default().push(rule); + Ok(()) +} + /// Block a syscall when a specific bit pattern is set in an argument. 
/// /// Uses `MaskedEq` to check `(arg & flag_bit) == flag_bit`, which triggers @@ -379,14 +424,14 @@ mod tests { #[test] fn build_filter_proxy_mode_compiles() { - let filter = build_filter(true); - assert!(filter.is_ok(), "build_filter(true) should succeed"); + let filter = build_filter(true, false); + assert!(filter.is_ok(), "build_filter(true, false) should succeed"); } #[test] fn build_filter_block_mode_compiles() { - let filter = build_filter(false); - assert!(filter.is_ok(), "build_filter(false) should succeed"); + let filter = build_filter(false, false); + assert!(filter.is_ok(), "build_filter(false, false) should succeed"); } #[test] @@ -417,7 +462,7 @@ mod tests { #[test] fn unconditional_blocks_present_in_filter() { // Build a real filter and verify all unconditional blocks are present. - let filter_rules = build_filter_rules(true).unwrap(); + let filter_rules = build_filter_rules(true, false).unwrap(); // Unconditional blocks have an empty Vec (no conditions = always match). let expected = [ @@ -460,7 +505,7 @@ mod tests { fn conditional_blocks_have_rules() { // Build a real filter and verify the conditional syscalls have rule entries // (non-empty Vec means conditional match). - let filter_rules = build_filter_rules(true).unwrap(); + let filter_rules = build_filter_rules(true, false).unwrap(); for syscall in [ libc::SYS_execveat, @@ -485,7 +530,7 @@ mod tests { // AF_NETLINK+non-ROUTE filter), but it must NOT be an unconditional block // (empty Vec). An empty Vec would block ALL socket() calls, including // socket(AF_NETLINK, *, NETLINK_ROUTE=0) which getifaddrs(3) needs. - let filter_rules = build_filter_rules(true).unwrap(); + let filter_rules = build_filter_rules(true, false).unwrap(); assert!( filter_rules.contains_key(&libc::SYS_socket), @@ -560,7 +605,7 @@ mod tests { #[test] fn clone3_not_in_main_filter() { // clone3 must NOT be in the main filter; it has its own ENOSYS filter. 
- let filter_rules = build_filter_rules(true).unwrap(); + let filter_rules = build_filter_rules(true, false).unwrap(); assert!( !filter_rules.contains_key(&libc::SYS_clone3), "clone3 should not be in the main filter — it uses a separate ENOSYS filter" @@ -623,37 +668,37 @@ mod tests { #[test] fn behavioral_memfd_create_blocked() { - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); unsafe { assert_blocked_in_child(&filter, libc::SYS_memfd_create, libc::EPERM) }; } #[test] fn behavioral_ptrace_blocked() { - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); unsafe { assert_blocked_in_child(&filter, libc::SYS_ptrace, libc::EPERM) }; } #[test] fn behavioral_process_vm_writev_blocked() { - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); unsafe { assert_blocked_in_child(&filter, libc::SYS_process_vm_writev, libc::EPERM) }; } #[test] fn behavioral_userfaultfd_blocked() { - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); unsafe { assert_blocked_in_child(&filter, libc::SYS_userfaultfd, libc::EPERM) }; } #[test] fn behavioral_perf_event_open_blocked() { - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); unsafe { assert_blocked_in_child(&filter, libc::SYS_perf_event_open, libc::EPERM) }; } #[test] fn behavioral_setns_blocked() { - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); unsafe { assert_blocked_in_child(&filter, libc::SYS_setns, libc::EPERM) }; } @@ -701,7 +746,7 @@ mod tests { fn behavioral_clone3_returns_enosys() { // clone3 uses a separate filter that returns ENOSYS (not EPERM) so // glibc falls back to clone. 
- let main_filter = build_filter(true).unwrap(); + let main_filter = build_filter(true, false).unwrap(); let clone3_filter = build_clone3_filter().unwrap(); // Apply in the same order as apply(): clone3 filter first, main filter second. let pid = unsafe { libc::fork() }; @@ -730,7 +775,7 @@ mod tests { #[test] fn behavioral_third_filter_install_blocked_after_startup() { - let main_filter = build_filter(true).unwrap(); + let main_filter = build_filter(true, false).unwrap(); let clone3_filter = build_clone3_filter().unwrap(); let third_filter = build_clone3_filter().unwrap(); @@ -772,7 +817,7 @@ mod tests { fn behavioral_netlink_route_allowed() { // socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE=0) must succeed (not blocked). // This is the call getifaddrs(3) makes on Linux to enumerate interfaces. - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); let pid = unsafe { libc::fork() }; assert!(pid >= 0, "fork failed"); if pid == 0 { @@ -807,7 +852,7 @@ mod tests { // socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG=4) must be blocked. // NETLINK_SOCK_DIAG is representative of non-ROUTE netlink protocols // that have no legitimate use inside the sandbox. - let filter = build_filter(true).unwrap(); + let filter = build_filter(true, false).unwrap(); let pid = unsafe { libc::fork() }; assert!(pid >= 0, "fork failed"); if pid == 0 { diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 30a21c643..25e9dbc6e 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -1271,6 +1271,14 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), sandbox_token: String::new(), + // Clamp to known NetworkEnforcementMode values. 
Unknown values + // default to NAMESPACE (0) for safety -- elevated capabilities are + // the more conservative posture since the supervisor can always + // create a network namespace for full isolation. + network_enforcement: match spec.policy.as_ref().map_or(0, |p| p.network_enforcement) { + 1 => 1, // PLATFORM + _ => 0, // NAMESPACE (default; unknown values are clamped here) + }, } } diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 610d491c7..48a7f4beb 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -6,6 +6,7 @@ syntax = "proto3"; package openshell.compute.v1; import "google/protobuf/struct.proto"; +import "sandbox.proto"; // Internal compute-driver contract used by the gateway. // @@ -96,6 +97,10 @@ message DriverSandboxSpec { // ServiceAccount token bootstrap instead). Never echoed to the public // Sandbox proto. string sandbox_token = 11; + // Network enforcement mode for this sandbox. When set to + // NETWORK_ENFORCEMENT_PLATFORM (1), the sandbox runs without elevated + // capabilities. Populated by the gateway from the SandboxPolicy. + openshell.sandbox.v1.NetworkEnforcementMode network_enforcement = 12; } // Driver-owned runtime template consumed by the compute platform. diff --git a/proto/sandbox.proto b/proto/sandbox.proto index ef0b0540f..e3330b3d0 100644 --- a/proto/sandbox.proto +++ b/proto/sandbox.proto @@ -13,6 +13,18 @@ package openshell.sandbox.v1; // - Public sandbox resource types live in `openshell.proto`. // - Internal compute-driver sandbox observation types live in `compute_driver.proto`. +// Network enforcement strategy for sandbox isolation. +enum NetworkEnforcementMode { + // Use a dedicated network namespace with veth pair and nftables anti-bypass + // rules. Requires CAP_SYS_ADMIN and CAP_NET_ADMIN. Default. + NETWORK_ENFORCEMENT_NAMESPACE = 0; + // Rely on Kubernetes NetworkPolicy for L3/L4 egress control. The + // supervisor binds the CONNECT proxy to loopback instead of veth.
No + // elevated capabilities required -- compatible with restricted-v2 SCC + // and restricted Pod Security Standard. + NETWORK_ENFORCEMENT_PLATFORM = 1; +} + // Sandbox security policy configuration. message SandboxPolicy { // Policy version. @@ -25,6 +37,9 @@ message SandboxPolicy { ProcessPolicy process = 4; // Network access policies keyed by name (e.g. "claude_code", "gitlab"). map network_policies = 5; + // Network enforcement mode. Default (0) preserves current namespace-based + // isolation for backward compatibility. + NetworkEnforcementMode network_enforcement = 6; } // Filesystem access policy. From 3d8c455c4297be62c3186b2f52c417648ef323a3 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Wed, 24 Jun 2026 18:03:35 +0200 Subject: [PATCH 2/2] feat(sandbox): Landlock TCP port restriction in Platform mode Add Landlock ABI v4 TCP connect restriction for Platform mode. When the kernel supports ABI v4, only the proxy port (default 3128) is allowed for outbound TCP connections. On older kernels, BestEffort compat level silently degrades -- the rule has no effect but the proxy still works cooperatively. 
Signed-off-by: Ladislav Smola --- .../src/sandbox/linux/landlock.rs | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index 4ef701680..ef6f19ce6 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -5,8 +5,8 @@ use crate::policy::{LandlockCompatibility, NetworkMode, SandboxPolicy}; use landlock::{ - ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, - RulesetAttr, RulesetCreatedAttr, Scope, + ABI, Access, AccessFs, AccessNet, CompatLevel, Compatible, NetPort, PathBeneath, PathFd, + PathFdError, Ruleset, RulesetAttr, RulesetCreatedAttr, Scope, }; use miette::{IntoDiagnostic, Result}; use std::path::{Path, PathBuf}; @@ -185,17 +185,17 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result) -> Result