From 42158f3e04b2c4641f45816a726b42ac38c5a455 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Wed, 24 Jun 2026 17:12:16 +0200 Subject: [PATCH 1/3] feat(sandbox): add Platform network mode for restricted K8s platforms Add NetworkMode::Platform for running the supervisor without elevated capabilities on Kubernetes platforms enforcing the restricted Pod Security Standard (including OpenShift restricted-v2 SCC). Platform Mode keeps Landlock filesystem isolation, seccomp syscall filtering, OPA policy evaluation, credential injection, and L7 inspection via a loopback CONNECT proxy. It replaces the network namespace (which requires CAP_SYS_ADMIN + CAP_NET_ADMIN) with: - Loopback proxy binding (127.0.0.1 instead of veth interface) - K8s driver: zero capabilities, drop ALL, non-root UID - seccomp: block SOCK_DGRAM (UDP) on AF_INET/AF_INET6 to match the nftables UDP reject in namespace mode -- the proxy resolves DNS on behalf of the agent, so UDP is not needed - Landlock scope: restrict abstract Unix sockets and signals (ABI v5+, BestEffort degrades on older kernels) Security parity with namespace mode: | Attack | Namespace mode | Platform mode | |------------------------|------------------------|--------------------------| | TCP bypass proxy | nftables REJECT | Landlock port 3128 only | | UDP exfiltration | nftables REJECT | seccomp SOCK_DGRAM block | | DNS tunneling | no UDP accept rule | no SOCK_DGRAM | | Abstract Unix sockets | netns isolation | Landlock scope | | Signals to supervisor | N/A (same netns) | Landlock scope | | Container escape | Risk (CAP_SYS_ADMIN) | Impossible (zero caps) | Remaining gap: Landlock NetPort allows port 3128 on any IP (not just loopback). Mitigate with egress NetworkPolicy denying all sandbox pod egress -- loopback traffic is unaffected by NetworkPolicy. Proto: add NetworkEnforcementMode enum and field to SandboxPolicy and DriverSandboxSpec. Default NAMESPACE (0) preserves existing behavior; PLATFORM (1) activates the new mode. 
Signed-off-by: Ladislav Smola --- crates/openshell-driver-docker/src/tests.rs | 1 + .../openshell-driver-kubernetes/src/driver.rs | 76 +++++--- crates/openshell-policy/src/lib.rs | 3 + crates/openshell-sandbox/src/lib.rs | 173 ++++++++++-------- crates/openshell-sandbox/src/opa.rs | 11 ++ crates/openshell-sandbox/src/policy.rs | 15 +- crates/openshell-sandbox/src/process.rs | 35 ++-- .../src/sandbox/linux/landlock.rs | 17 +- .../src/sandbox/linux/seccomp.rs | 43 ++++- crates/openshell-server/src/compute/mod.rs | 8 + proto/compute_driver.proto | 5 + proto/sandbox.proto | 15 ++ 12 files changed, 281 insertions(+), 121 deletions(-) diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index c9b34ff8f..935b64d71 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -45,6 +45,7 @@ fn test_sandbox() -> DriverSandbox { gpu: false, gpu_device: String::new(), sandbox_token: String::new(), + network_enforcement: 0, }), status: None, } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 5a43eb980..ddc8b47f0 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -330,6 +330,10 @@ impl KubernetesComputeDriver { enable_user_namespaces: self.config.enable_user_namespaces, workspace_default_storage_size: &self.config.workspace_default_storage_size, sa_token_ttl_secs: self.config.effective_sa_token_ttl_secs(), + is_platform_mode: sandbox + .spec + .as_ref() + .is_some_and(|s| s.network_enforcement == 1), }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); let api = self.api(); @@ -823,6 +827,7 @@ fn apply_supervisor_sideload( supervisor_image: &str, supervisor_image_pull_policy: &str, method: SupervisorSideloadMethod, + is_platform_mode: bool, ) { let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { return; 
@@ -882,16 +887,16 @@ fn apply_supervisor_sideload( serde_json::json!([format!("{}/openshell-sandbox", SUPERVISOR_MOUNT_PATH)]), ); - // Force the supervisor to run as root (UID 0). Sandbox images may set - // a non-root USER directive (e.g. `USER sandbox`), but the supervisor - // needs root to create network namespaces, set up the proxy, and - // configure Landlock/seccomp. The supervisor itself drops privileges - // for child processes via the policy's `run_as_user`/`run_as_group`. - let security_context = container - .entry("securityContext") - .or_insert_with(|| serde_json::json!({})); - if let Some(sc) = security_context.as_object_mut() { - sc.insert("runAsUser".to_string(), serde_json::json!(0)); + // In namespace mode, force root (UID 0) so the supervisor can create + // network namespaces and drop privileges for child processes. + // In platform mode, keep the image's default non-root user. + if !is_platform_mode { + let security_context = container + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = security_context.as_object_mut() { + sc.insert("runAsUser".to_string(), serde_json::json!(0)); + } } // Add volume mount @@ -1044,6 +1049,10 @@ struct SandboxPodParams<'a> { /// Lifetime (seconds) of the projected `ServiceAccount` token used /// for the bootstrap `IssueSandboxToken` exchange. sa_token_ttl_secs: i64, + /// Platform network enforcement mode (Issue #899). When true, sandbox + /// pods are emitted without elevated capabilities, compatible with + /// restricted-v2 SCC and restricted Pod Security Standard. 
+ is_platform_mode: bool, } impl Default for SandboxPodParams<'_> { @@ -1065,6 +1074,7 @@ impl Default for SandboxPodParams<'_> { enable_user_namespaces: false, workspace_default_storage_size: DEFAULT_WORKSPACE_STORAGE_SIZE, sa_token_ttl_secs: 3600, + is_platform_mode: false, } } } @@ -1265,22 +1275,32 @@ fn sandbox_template_to_k8s( container.insert("env".to_string(), serde_json::Value::Array(env)); - let mut capabilities: Vec<&str> = vec!["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]; - if use_user_namespaces { - // In a user namespace the bounding set is reset. SETUID/SETGID are - // needed for the supervisor to drop privileges to the sandbox user. - // DAC_READ_SEARCH is needed for cross-UID /proc//fd/ access - // for process identity resolution in network policy enforcement. - capabilities.extend(["SETUID", "SETGID", "DAC_READ_SEARCH"]); + if params.is_platform_mode { + // Platform mode: zero elevated capabilities. Compatible with + // restricted-v2 SCC and restricted Pod Security Standard. 
+ container.insert( + "securityContext".to_string(), + serde_json::json!({ + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + } + }), + ); + } else { + let mut capabilities: Vec<&str> = vec!["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]; + if use_user_namespaces { + capabilities.extend(["SETUID", "SETGID", "DAC_READ_SEARCH"]); + } + container.insert( + "securityContext".to_string(), + serde_json::json!({ + "capabilities": { + "add": capabilities + } + }), + ); } - container.insert( - "securityContext".to_string(), - serde_json::json!({ - "capabilities": { - "add": capabilities - } - }), - ); // Mount client TLS secret for mTLS to the server, plus the projected // ServiceAccount token used to bootstrap the sandbox's gateway JWT @@ -1363,6 +1383,7 @@ fn sandbox_template_to_k8s( params.supervisor_image, params.supervisor_image_pull_policy, params.supervisor_sideload_method, + params.is_platform_mode, ); // Inject workspace persistence (init container + PVC volume mount) so @@ -1750,6 +1771,7 @@ mod tests { "custom-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -1779,6 +1801,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); let sc = &pod_template["spec"]["containers"][0]["securityContext"]; @@ -1804,6 +1827,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::InitContainer, + false, ); // Volume should be an emptyDir @@ -1878,6 +1902,7 @@ mod tests { "supervisor-image:latest", "IfNotPresent", SupervisorSideloadMethod::ImageVolume, + false, ); let volumes = pod_template["spec"]["volumes"] @@ -1932,6 +1957,7 @@ mod tests { "supervisor-image:latest", "", SupervisorSideloadMethod::ImageVolume, + false, ); let volume = &pod_template["spec"]["volumes"][0]; diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index 
26c8fc9d3..d55a2806c 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -378,6 +378,7 @@ fn to_proto(raw: PolicyFile) -> SandboxPolicy { run_as_group: p.run_as_group, }), network_policies, + network_enforcement: 0, } } @@ -649,6 +650,7 @@ pub fn restrictive_default_policy() -> SandboxPolicy { run_as_group: "sandbox".into(), }), network_policies: HashMap::new(), + network_enforcement: 0, // NAMESPACE (default) } } @@ -1262,6 +1264,7 @@ network_policies: filesystem: None, landlock: None, network_policies: HashMap::new(), + network_enforcement: 0, }; assert!(validate_sandbox_policy(&policy).is_ok()); } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 126416546..8e4fc9286 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -487,7 +487,10 @@ pub async fn run_sandbox( // Generate ephemeral CA and TLS state for HTTPS L7 inspection. // The CA cert is written to disk so sandbox processes can trust it. - let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { + let (tls_state, ca_file_paths) = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { match SandboxCa::generate() { Ok(ca) => { let tls_dir = std::path::Path::new("/etc/openshell-tls"); @@ -600,79 +603,91 @@ pub async fn run_sandbox( // the entrypoint process's /proc/net/tcp for identity binding. 
let entrypoint_pid = Arc::new(AtomicU32::new(0)); - let (_proxy, denial_rx, bypass_denial_tx, activity_rx, bypass_activity_tx) = - if matches!(policy.network.mode, NetworkMode::Proxy) { - let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!( - "Network mode is set to proxy but no proxy configuration was provided" - ) - })?; + let (_proxy, denial_rx, bypass_denial_tx, activity_rx, bypass_activity_tx) = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { + let proxy_policy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!("Network mode is set to proxy but no proxy configuration was provided") + })?; - let engine = opa_engine.clone().ok_or_else(|| { - miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") - })?; + let engine = opa_engine.clone().ok_or_else(|| { + miette::miette!("Proxy mode requires an OPA engine (--rego-policy and --rego-data)") + })?; - let cache = identity_cache.clone().ok_or_else(|| { - miette::miette!( - "Proxy mode requires an identity cache (OPA engine must be configured)" - ) - })?; + let cache = identity_cache.clone().ok_or_else(|| { + miette::miette!("Proxy mode requires an identity cache (OPA engine must be configured)") + })?; + + // If we have a network namespace, bind to the veth host IP so sandboxed + // processes can reach the proxy via TCP. + #[cfg(target_os = "linux")] + let bind_addr = netns.as_ref().map(|ns| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ns.host_ip(), port) + }); - // If we have a network namespace, bind to the veth host IP so sandboxed - // processes can reach the proxy via TCP. - #[cfg(target_os = "linux")] - let bind_addr = netns.as_ref().map(|ns| { + // Platform mode: no netns, bind proxy to loopback. 
+ #[cfg(target_os = "linux")] + let bind_addr = bind_addr.or_else(|| { + if matches!(policy.network.mode, NetworkMode::Platform) { let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ns.host_ip(), port) - }); + Some(SocketAddr::new( + std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), + port, + )) + } else { + None + } + }); - #[cfg(not(target_os = "linux"))] - let bind_addr: Option = None; + #[cfg(not(target_os = "linux"))] + let bind_addr: Option = None; - // Build inference context for local routing of intercepted inference calls. - let inference_ctx = build_inference_context( - sandbox_id.as_deref(), - openshell_endpoint_for_proxy.as_deref(), - inference_routes.as_deref(), - ) - .await?; - - // Create denial aggregator channel if in gRPC mode (sandbox_id present). - // Clone the sender for the bypass monitor before passing to the proxy. - let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); - let bypass_tx = tx.clone(); - (Some(tx), Some(rx), Some(bypass_tx)) - } else { - (None, None, None) - }; - let (activity_tx, activity_rx, bypass_activity_tx) = - activity_collection_channels(sandbox_id.as_deref()); - - let proxy_handle = ProxyHandle::start_with_bind_addr( - proxy_policy, - bind_addr, - engine, - cache, - entrypoint_pid.clone(), - tls_state, - inference_ctx, - Some(provider_credentials.clone()), - Some(policy_local_ctx.clone()), - denial_tx, - activity_tx, - ) - .await?; - ( - Some(proxy_handle), - denial_rx, - bypass_denial_tx, - activity_rx, - bypass_activity_tx, - ) + // Build inference context for local routing of intercepted inference calls. + let inference_ctx = build_inference_context( + sandbox_id.as_deref(), + openshell_endpoint_for_proxy.as_deref(), + inference_routes.as_deref(), + ) + .await?; + + // Create denial aggregator channel if in gRPC mode (sandbox_id present). 
+ // Clone the sender for the bypass monitor before passing to the proxy. + let (denial_tx, denial_rx, bypass_denial_tx) = if sandbox_id.is_some() { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let bypass_tx = tx.clone(); + (Some(tx), Some(rx), Some(bypass_tx)) } else { - (None, None, None, None, None) + (None, None, None) }; + let (activity_tx, activity_rx, bypass_activity_tx) = + activity_collection_channels(sandbox_id.as_deref()); + + let proxy_handle = ProxyHandle::start_with_bind_addr( + proxy_policy, + bind_addr, + engine, + cache, + entrypoint_pid.clone(), + tls_state, + inference_ctx, + Some(provider_credentials.clone()), + Some(policy_local_ctx.clone()), + denial_tx, + activity_tx, + ) + .await?; + ( + Some(proxy_handle), + denial_rx, + bypass_denial_tx, + activity_rx, + bypass_activity_tx, + ) + } else { + (None, None, None, None, None) + }; // Spawn bypass detection monitor (Linux only, proxy mode only). // Reads /dev/kmsg for nftables log entries and emits structured @@ -705,18 +720,30 @@ pub async fn run_sandbox( #[cfg(not(target_os = "linux"))] let ssh_netns_fd: Option = None; - let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { + let ssh_proxy_url = if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { #[cfg(target_os = "linux")] { - netns.as_ref().map(|ns| { + if let Some(ns) = netns.as_ref() { let port = policy .network .proxy .as_ref() .and_then(|p| p.http_addr) .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) + Some(format!("http://{}:{port}", ns.host_ip())) + } else { + // Platform mode: proxy on loopback + let port = policy + .network + .proxy + .as_ref() + .and_then(|p| p.http_addr) + .map_or(3128, |addr| addr.port()); + Some(format!("http://127.0.0.1:{port}")) + } } #[cfg(not(target_os = "linux"))] { @@ -1729,8 +1756,10 @@ where } fn enrich_sandbox_baseline_paths(policy: &mut SandboxPolicy) { - let (ro, rw) = - 
active_baseline_enrichment_paths(matches!(policy.network.mode, NetworkMode::Proxy)); + let (ro, rw) = active_baseline_enrichment_paths(matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + )); let modified = enrich_sandbox_baseline_paths_with(policy, &ro, &rw, std::path::Path::exists); if modified { diff --git a/crates/openshell-sandbox/src/opa.rs b/crates/openshell-sandbox/src/opa.rs index f5ff5923b..cbb7b4074 100644 --- a/crates/openshell-sandbox/src/opa.rs +++ b/crates/openshell-sandbox/src/opa.rs @@ -1262,6 +1262,7 @@ mod tests { run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, } } @@ -2518,6 +2519,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2641,6 +2643,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2698,6 +2701,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -2755,6 +2759,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); @@ -3704,6 +3709,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); let input = NetworkInput { @@ -3758,6 +3764,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("engine from proto"); let input = NetworkInput { @@ -3828,6 +3835,7 @@ process: run_as_group: "sandbox".to_string(), }), network_policies, + 
network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).expect("Failed to create engine from proto"); @@ -4058,6 +4066,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; let engine = OpaEngine::from_proto(&proto).unwrap(); // Port 443 @@ -5017,6 +5026,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; // Build engine with our PID (symlink resolution will work via /proc/self/root/) @@ -5094,6 +5104,7 @@ network_policies: run_as_group: "sandbox".to_string(), }), network_policies, + network_enforcement: 0, }; // Initial load at pid=0 — no symlink expansion diff --git a/crates/openshell-sandbox/src/policy.rs b/crates/openshell-sandbox/src/policy.rs index 0827fa0d0..fde345b83 100644 --- a/crates/openshell-sandbox/src/policy.rs +++ b/crates/openshell-sandbox/src/policy.rs @@ -5,7 +5,8 @@ use openshell_core::proto::{ FilesystemPolicy as ProtoFilesystemPolicy, LandlockPolicy as ProtoLandlockPolicy, - ProcessPolicy as ProtoProcessPolicy, SandboxPolicy as ProtoSandboxPolicy, + NetworkEnforcementMode, ProcessPolicy as ProtoProcessPolicy, + SandboxPolicy as ProtoSandboxPolicy, }; use std::net::SocketAddr; use std::path::PathBuf; @@ -62,6 +63,9 @@ pub enum NetworkMode { Block, Proxy, Allow, + /// Platform mode: Landlock + seccomp + loopback proxy, no network namespace. + /// Compatible with restricted-v2 SCC and restricted Pod Security Standard. + Platform, } #[derive(Debug, Clone)] @@ -99,10 +103,13 @@ impl TryFrom for SandboxPolicy { type Error = miette::Report; fn try_from(proto: ProtoSandboxPolicy) -> Result { - // In cluster mode we always run with proxy networking so all egress - // can be evaluated by OPA and `inference.local` is always addressable. 
+ let mode = match proto.network_enforcement() { + NetworkEnforcementMode::NetworkEnforcementNamespace => NetworkMode::Proxy, + NetworkEnforcementMode::NetworkEnforcementPlatform => NetworkMode::Platform, + }; + let network = NetworkPolicy { - mode: NetworkMode::Proxy, + mode, proxy: Some(ProxyPolicy { http_addr: None }), }; diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index d004bb7d4..f059584ce 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -226,27 +226,25 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { + if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { let proxy = policy.network.proxy.as_ref().ok_or_else(|| { miette::miette!( "Network mode is set to proxy but no proxy configuration was provided" ) })?; - // When using network namespace, set proxy URL to the veth host IP - if netns_fd.is_some() { - // The proxy is on 10.200.0.1:3128 (or configured port) - let port = proxy.http_addr.map_or(3128, |addr| addr.port()); - let proxy_url = format!("http://10.200.0.1:{port}"); - // Both uppercase and lowercase variants: curl/wget use uppercase, - // gRPC C-core (libgrpc) checks lowercase http_proxy/https_proxy. 
- for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } - } else if let Some(http_addr) = proxy.http_addr { - let proxy_url = format!("http://{http_addr}"); - for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } + let port = proxy.http_addr.map_or(3128, |addr| addr.port()); + let proxy_url = if netns_fd.is_some() { + // Namespace mode: proxy on veth host IP + format!("http://10.200.0.1:{port}") + } else { + // Platform mode (or non-Linux): proxy on loopback + format!("http://127.0.0.1:{port}") + }; + for (key, value) in child_env::proxy_env_vars(&proxy_url) { + cmd.env(key, value); } } @@ -368,7 +366,10 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { + if matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Platform + ) { let proxy = policy.network.proxy.as_ref().ok_or_else(|| { miette::miette!( "Network mode is set to proxy but no proxy configuration was provided" diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index e7f37ce4f..705c4f725 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -3,10 +3,10 @@ //! Landlock filesystem sandboxing. 
-use crate::policy::{LandlockCompatibility, SandboxPolicy}; +use crate::policy::{LandlockCompatibility, NetworkMode, SandboxPolicy}; use landlock::{ ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, - RulesetAttr, RulesetCreatedAttr, + RulesetAttr, RulesetCreatedAttr, Scope, }; use miette::{IntoDiagnostic, Result}; use std::path::{Path, PathBuf}; @@ -184,6 +184,19 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result Result<()> { } pub fn apply(policy: &SandboxPolicy) -> Result<()> { - let allow_inet = matches!(policy.network.mode, NetworkMode::Proxy | NetworkMode::Allow); + let allow_inet = matches!( + policy.network.mode, + NetworkMode::Proxy | NetworkMode::Allow | NetworkMode::Platform + ); let main_filter = build_filter(allow_inet)?; let clone3_filter = build_clone3_filter()?; @@ -202,6 +205,21 @@ fn build_filter_rules(allow_inet: bool) -> Result add_socket_domain_rule(&mut rules, domain)?; } + // Block UDP sockets (SOCK_DGRAM) on AF_INET/AF_INET6. + // + // The agent doesn't need UDP: all traffic goes through the CONNECT proxy + // on 127.0.0.1:3128, which resolves DNS on behalf of the agent. This + // matches Full OpenShell behavior where nftables rejects all UDP in the + // network namespace (nft_ruleset.rs:48-49 has no UDP accept rule). + // + // Without this block, an agent could exfiltrate data via DNS tunneling + // (encoding secrets in DNS subdomain labels) or send UDP packets to + // arbitrary destinations -- Landlock ABI v4 only covers TCP. + if allow_inet { + add_sock_dgram_block(&mut rules, libc::AF_INET)?; + add_sock_dgram_block(&mut rules, libc::AF_INET6)?; + } + // Allow AF_NETLINK only for NETLINK_ROUTE (protocol 0). // // NETLINK_ROUTE is needed by getifaddrs(3) which is called by Node.js, @@ -339,6 +357,29 @@ fn add_netlink_non_route_rule(rules: &mut BTreeMap>) -> Re Ok(()) } +/// Block `socket(domain, SOCK_DGRAM, *)` to prevent UDP socket creation. 
+/// +/// Uses `MaskedEq` on arg1 with mask `0xF` (SOCK_TYPE_MASK) to match +/// `SOCK_DGRAM` (2) regardless of `SOCK_NONBLOCK` or `SOCK_CLOEXEC` flags. +#[allow(clippy::cast_sign_loss)] +fn add_sock_dgram_block(rules: &mut BTreeMap>, domain: i32) -> Result<()> { + let domain_condition = + SeccompCondition::new(0, SeccompCmpArgLen::Dword, SeccompCmpOp::Eq, domain as u64) + .into_diagnostic()?; + + let type_condition = SeccompCondition::new( + 1, // type argument + SeccompCmpArgLen::Dword, + SeccompCmpOp::MaskedEq(0xF), // SOCK_TYPE_MASK + libc::SOCK_DGRAM as u64, + ) + .into_diagnostic()?; + + let rule = SeccompRule::new(vec![domain_condition, type_condition]).into_diagnostic()?; + rules.entry(libc::SYS_socket).or_default().push(rule); + Ok(()) +} + /// Block a syscall when a specific bit pattern is set in an argument. /// /// Uses `MaskedEq` to check `(arg & flag_bit) == flag_bit`, which triggers diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 30a21c643..25e9dbc6e 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -1271,6 +1271,14 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), sandbox_token: String::new(), + // Clamp to known NetworkEnforcementMode values. Unknown values + // default to NAMESPACE (0) for safety -- elevated capabilities are + // the more conservative posture since the supervisor can always + // create a network namespace for full isolation. 
+ network_enforcement: match spec.policy.as_ref().map_or(0, |p| p.network_enforcement) { + 1 => 1, // PLATFORM + _ => 0, // NAMESPACE (default, reject unknown values) + }, } } diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 610d491c7..48a7f4beb 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -6,6 +6,7 @@ syntax = "proto3"; package openshell.compute.v1; import "google/protobuf/struct.proto"; +import "sandbox.proto"; // Internal compute-driver contract used by the gateway. // @@ -96,6 +97,10 @@ message DriverSandboxSpec { // ServiceAccount token bootstrap instead). Never echoed to the public // Sandbox proto. string sandbox_token = 11; + // Network enforcement mode for this sandbox. When set to + // NETWORK_ENFORCEMENT_PLATFORM (1), the sandbox runs without elevated + // capabilities. Populated by the gateway from the SandboxPolicy. + openshell.sandbox.v1.NetworkEnforcementMode network_enforcement = 12; } // Driver-owned runtime template consumed by the compute platform. diff --git a/proto/sandbox.proto b/proto/sandbox.proto index ef0b0540f..e3330b3d0 100644 --- a/proto/sandbox.proto +++ b/proto/sandbox.proto @@ -13,6 +13,18 @@ package openshell.sandbox.v1; // - Public sandbox resource types live in `openshell.proto`. // - Internal compute-driver sandbox observation types live in `compute_driver.proto`. +// Network enforcement strategy for sandbox isolation. +enum NetworkEnforcementMode { + // Use a dedicated network namespace with veth pair and nftables bypass + // rules. Requires CAP_SYS_ADMIN and CAP_NET_ADMIN. Default. + NETWORK_ENFORCEMENT_NAMESPACE = 0; + // Rely on Kubernetes NetworkPolicy for L3/L4 egress control. The + // supervisor binds the CONNECT proxy to loopback instead of veth. No + // elevated capabilities required -- compatible with restricted-v2 SCC + // and restricted Pod Security Standard. + NETWORK_ENFORCEMENT_PLATFORM = 1; +} + // Sandbox security policy configuration. 
message SandboxPolicy { // Policy version. @@ -25,6 +37,9 @@ message SandboxPolicy { ProcessPolicy process = 4; // Network access policies keyed by name (e.g. "claude_code", "gitlab"). map network_policies = 5; + // Network enforcement mode. Default (0) preserves current namespace-based + // isolation for backward compatibility. + NetworkEnforcementMode network_enforcement = 6; } // Filesystem access policy. From 4c7356f62ec119028d6180cd5734cf002b577813 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Wed, 24 Jun 2026 16:10:29 +0200 Subject: [PATCH 2/3] feat(sandbox): Landlock TCP port restriction in Platform mode Add Landlock ABI v4 TCP connect restriction for Platform mode. When the kernel supports ABI v4, only the proxy port (default 3128) is allowed for outbound TCP connections. On older kernels, BestEffort compat level silently degrades -- the rule has no effect but the proxy still works cooperatively. Both handle_access(ConnectTcp) and add_rule(NetPort) use the ? operator since BestEffort guarantees they succeed on all kernel versions. 
Signed-off-by: Ladislav Smola --- .../src/sandbox/linux/landlock.rs | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs index 705c4f725..32e888650 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/landlock.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/landlock.rs @@ -5,8 +5,8 @@ use crate::policy::{LandlockCompatibility, NetworkMode, SandboxPolicy}; use landlock::{ - ABI, Access, AccessFs, CompatLevel, Compatible, PathBeneath, PathFd, PathFdError, Ruleset, - RulesetAttr, RulesetCreatedAttr, Scope, + ABI, Access, AccessFs, AccessNet, CompatLevel, Compatible, NetPort, PathBeneath, PathFd, + PathFdError, Ruleset, RulesetAttr, RulesetCreatedAttr, Scope, }; use miette::{IntoDiagnostic, Result}; use std::path::{Path, PathBuf}; @@ -184,6 +184,7 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result) -> Result>>>>>> 5c72320 (feat(sandbox): Landlock TCP port restriction in Platform mode) } let mut ruleset = ruleset.create().into_diagnostic()?; @@ -220,6 +230,28 @@ pub fn prepare(policy: &SandboxPolicy, workdir: Option<&str>) -> Result) -> Result Date: Wed, 24 Jun 2026 17:13:26 +0200 Subject: [PATCH 3/3] feat(sandbox): seccomp-notify DNS-pinned allowlist for Platform mode Add kernel-level connect() interception using SECCOMP_RET_USER_NOTIF. The supervisor intercepts network syscalls, reads the destination sockaddr from the child's memory via /proc/pid/mem, evaluates it against a DNS-pinned allowlist, and either performs the operation on behalf of the child via pidfd_getfd() or denies it with EPERM. 
Components: - DnsPinnedAllowlist: resolve domains to IPs at sandbox creation, freeze for session lifetime to prevent DNS rebinding - BPF filter with AUDIT_ARCH validation for connect/sendto/sendmsg/ recvfrom/recvmsg/bind syscalls - pidfd_open + pidfd_getfd for TOCTOU-safe on-behalf-of operations - parse_sockaddr with correct endianness for IPv4/IPv6 - read_process_memory with read_exact for short-read safety Known limitation: DnsPinnedAllowlist cannot handle wildcard domains (*.example.com) because getaddrinfo does not support wildcards. Callers must skip wildcard endpoints and rely on the proxy OPA glob.match() for wildcard domain enforcement. Signed-off-by: Ladislav Smola --- crates/openshell-sandbox/src/lib.rs | 1 + .../openshell-sandbox/src/seccomp_notify.rs | 602 ++++++++++++++++++ 2 files changed, 603 insertions(+) create mode 100644 crates/openshell-sandbox/src/seccomp_notify.rs diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 8e4fc9286..e8b833d4b 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -23,6 +23,7 @@ pub mod procfs; mod provider_credentials; pub mod proxy; mod sandbox; +pub mod seccomp_notify; mod secrets; mod skills; mod ssh; diff --git a/crates/openshell-sandbox/src/seccomp_notify.rs b/crates/openshell-sandbox/src/seccomp_notify.rs new file mode 100644 index 000000000..6064aeeec --- /dev/null +++ b/crates/openshell-sandbox/src/seccomp_notify.rs @@ -0,0 +1,602 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! seccomp-notify network enforcement for Platform mode. +//! +//! Provides kernel-level `connect()` interception using `SECCOMP_RET_USER_NOTIF`. +//! The supervisor intercepts network syscalls, reads the destination `sockaddr` +//! from the child's memory, evaluates it against a DNS-pinned allowlist, and +//! 
either performs the operation on behalf of the child via `pidfd_getfd()` or +//! denies it with `EPERM`. +//! +//! # Architecture +//! +//! The supervisor forks before exec'ing the agent. The child installs a seccomp +//! filter with `SECCOMP_FILTER_FLAG_NEW_LISTENER` that returns +//! `SECCOMP_RET_USER_NOTIF` for `connect()`, `sendto()`, and `sendmsg()`. The +//! notification fd is sent to the parent via a Unix socket. The parent runs an +//! async event loop processing notifications. +//! +//! # TOCTOU Safety +//! +//! The supervisor never uses `SECCOMP_USER_NOTIF_FLAG_CONTINUE`. Instead, it +//! reads the `sockaddr` once, validates it, then performs the `connect()` itself +//! using `pidfd_getfd()` to duplicate the child's socket fd. The original +//! syscall is never continued. +//! +//! # Requirements +//! +//! - Linux 5.0+ (`SECCOMP_RET_USER_NOTIF`) +//! - Linux 5.6+ (`pidfd_getfd`) +//! - Linux 5.9+ (`SECCOMP_IOCTL_NOTIF_ADDFD`) +//! - `RuntimeDefault` seccomp profile must allow the `seccomp()` syscall +//! - `io_uring` must be blocked (`RuntimeDefault` does this) +//! +//! # References +//! +//! - [`seccomp_unotify(2)`](https://www.man7.org/linux/man-pages/man2/seccomp_unotify.2.html) +//! - [`pidfd_getfd(2)`](https://man7.org/linux/man-pages/man2/pidfd_getfd.2.html) +//! - [Sandlock](https://github.com/multikernel/sandlock) -- reference implementation + +use std::collections::HashSet; +use std::net::{IpAddr, SocketAddr}; + +// --------------------------------------------------------------------------- +// DNS-Pinned Allowlist +// --------------------------------------------------------------------------- + +/// A set of allowed destination IPs, pinned at sandbox creation time. +/// +/// DNS resolution happens once during [`DnsPinnedAllowlist::add_domain()`]. +/// The resolved IPs are frozen for the sandbox session lifetime to prevent +/// DNS rebinding attacks. 
+/// +/// # Limitation: DNS wildcards +/// +/// Wildcard domains (e.g., `*.googleapis.com`) cannot be pinned because +/// `getaddrinfo("*.googleapis.com")` is not valid DNS and returns an error. +/// Given this OPA policy: +/// +/// ```yaml +/// endpoints: +/// - { host: api.anthropic.com, port: 443 } # exact -- pinnable +/// - { host: "*.googleapis.com", port: 443 } # wildcard -- NOT pinnable +/// ``` +/// +/// Exact domains work: `add_domain("api.anthropic.com")` resolves and pins +/// the IPs. But `add_domain("*.googleapis.com")` fails, pins zero IPs, and +/// connections to `us-central1-aiplatform.googleapis.com` are denied even +/// though the OPA policy allows them. +/// +/// Callers must skip wildcard endpoints (those containing `*`) and rely on +/// the proxy's OPA `glob.match()` for wildcard domain enforcement. +#[derive(Debug, Clone)] +pub struct DnsPinnedAllowlist { + allowed_ips: HashSet<IpAddr>, + proxy_addr: SocketAddr, +} + +impl DnsPinnedAllowlist { + /// Create an allowlist that permits only loopback proxy connections. + pub fn new(proxy_addr: SocketAddr) -> Self { + let mut allowed_ips = HashSet::new(); + allowed_ips.insert(proxy_addr.ip()); + allowed_ips.insert(IpAddr::V4(std::net::Ipv4Addr::LOCALHOST)); + allowed_ips.insert(IpAddr::V6(std::net::Ipv6Addr::LOCALHOST)); + Self { + allowed_ips, + proxy_addr, + } + } + + /// Resolve a domain name and pin its IPs to the allowlist. + pub fn add_domain(&mut self, domain: &str) -> std::io::Result<usize> { + use std::net::ToSocketAddrs; + let addrs: Vec<_> = (domain, 0).to_socket_addrs()?.collect(); + let count = addrs.len(); + for addr in addrs { + self.allowed_ips.insert(addr.ip()); + } + Ok(count) + } + + /// Check whether a destination IP is in the allowlist. + pub fn is_allowed(&self, ip: &IpAddr) -> bool { + self.allowed_ips.contains(ip) + } + + /// The proxy address (always allowed). + pub fn proxy_addr(&self) -> SocketAddr { + self.proxy_addr + } + + /// Number of pinned IPs. 
+ pub fn len(&self) -> usize { + self.allowed_ips.len() + } + + /// Whether the allowlist contains only the default entries. + pub fn is_empty(&self) -> bool { + self.allowed_ips.len() <= 3 + } +} + +// --------------------------------------------------------------------------- +// Linux-specific seccomp-notify syscall wrappers +// --------------------------------------------------------------------------- + +/// Raw Linux seccomp notification structures and syscall wrappers. +/// +/// These are defined here because `libc` 0.2.x does not export the full +/// notification API (`seccomp_notif`, `seccomp_notif_resp`, ioctls). +#[cfg(target_os = "linux")] +#[allow(unsafe_code, clippy::cast_possible_truncation)] +pub mod linux { + use std::io; + use std::mem; + use std::mem::size_of; + use std::os::unix::io::RawFd; + + // --- Seccomp constants --- + const SECCOMP_SET_MODE_FILTER: libc::c_uint = 1; + const SECCOMP_FILTER_FLAG_NEW_LISTENER: libc::c_uint = 1 << 3; + const SECCOMP_RET_USER_NOTIF: u32 = 0x7FC0_0000; + const SECCOMP_RET_ALLOW: u32 = 0x7FFF_0000; + + // ioctl commands for the notification fd (SECCOMP_IOC_MAGIC is '!' = 0x21). + // These match the kernel definitions for all architectures (x86_64, aarch64). + // Note: SECCOMP_IOCTL_NOTIF_ID_VALID changed from _IOR to _IOW in Linux 5.17. + // We use the post-5.17 value. On pre-5.17 kernels, id_valid() returns false + // and the caller should treat the notification as potentially stale. + const SECCOMP_IOCTL_NOTIF_RECV: libc::c_ulong = 0xC050_2100; + const SECCOMP_IOCTL_NOTIF_SEND: libc::c_ulong = 0xC018_2101; + const SECCOMP_IOCTL_NOTIF_ID_VALID: libc::c_ulong = 0x4008_2102; + + // Syscall numbers (same on x86_64 and aarch64) + const SYS_PIDFD_OPEN: libc::c_long = 434; + const SYS_PIDFD_GETFD: libc::c_long = 438; + + // --- Notification structs --- + + /// Seccomp notification received from the kernel. 
+ #[repr(C)] + #[derive(Debug, Clone)] + pub struct SeccompNotif { + pub id: u64, + pub pid: u32, + pub flags: u32, + pub data: SeccompData, + } + + /// Syscall data from the notification. + #[repr(C)] + #[derive(Debug, Clone)] + pub struct SeccompData { + pub nr: i32, + pub arch: u32, + pub instruction_pointer: u64, + pub args: [u64; 6], + } + + /// Response to send back to the kernel. + #[repr(C)] + #[derive(Debug, Clone)] + pub struct SeccompNotifResp { + pub id: u64, + pub val: i64, + pub error: i32, + pub flags: u32, + } + + // --- BPF filter types --- + + #[repr(C)] + struct SockFilter { + code: u16, + jt: u8, + jf: u8, + k: u32, + } + + #[repr(C)] + struct SockFprog { + len: u16, + filter: *const SockFilter, + } + + // BPF instruction encoding constants + const BPF_LD: u16 = 0x00; + const BPF_W: u16 = 0x00; + const BPF_ABS: u16 = 0x20; + const BPF_JMP: u16 = 0x05; + const BPF_JEQ: u16 = 0x10; + const BPF_K: u16 = 0x00; + const BPF_RET: u16 = 0x06; + + // AUDIT_ARCH constants for BPF architecture validation + #[cfg(target_arch = "x86_64")] + const AUDIT_ARCH_NATIVE: u32 = 0xC000_003E; // AUDIT_ARCH_X86_64 + #[cfg(target_arch = "aarch64")] + const AUDIT_ARCH_NATIVE: u32 = 0xC000_00B7; // AUDIT_ARCH_AARCH64 + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + compile_error!( + "seccomp-notify BPF filter requires AUDIT_ARCH_NATIVE. \ + Add the constant for your target_arch to seccomp_notify.rs." + ); + + /// Install a seccomp BPF filter that returns `SECCOMP_RET_USER_NOTIF` + /// for `connect()`, `sendto()`, `sendmsg()`, `recvfrom()`, `recvmsg()`, and `bind()`. + /// + /// Returns the notification fd on success. The caller is responsible for + /// closing the fd (e.g., by wrapping it in `OwnedFd`). + /// + /// Calls from a non-native `AUDIT_ARCH` are notified too; x32 coverage is unconfirmed (TODO). + /// + /// Must be called after `prctl(PR_SET_NO_NEW_PRIVS, 1)` has been set. 
+ pub fn install_connect_notify_filter() -> io::Result<RawFd> { + let filter = [ + // [0] Load architecture from seccomp_data.arch (offset 4) + SockFilter { + code: BPF_LD | BPF_W | BPF_ABS, + jt: 0, + jf: 0, + k: 4, // offsetof(seccomp_data, arch) + }, + // [1] Verify native arch; non-native goes to NOTIFY [10] so the + // supervisor can inspect and deny compat-ABI syscalls. + SockFilter { + code: BPF_JMP | BPF_JEQ | BPF_K, + jt: 0, // continue + jf: 8, // non-native → NOTIFY [10] + k: AUDIT_ARCH_NATIVE, + }, + // [2] Load syscall number from seccomp_data.nr (offset 0) + SockFilter { + code: BPF_LD | BPF_W | BPF_ABS, + jt: 0, + jf: 0, + k: 0, // offsetof(seccomp_data, nr) + }, + // [3] Check connect → NOTIFY [10] + SockFilter { + code: BPF_JMP | BPF_JEQ | BPF_K, + jt: 6, // jump to NOTIFY + jf: 0, + k: libc::SYS_connect as u32, + }, + // [4] Check sendto → NOTIFY [10] + SockFilter { + code: BPF_JMP | BPF_JEQ | BPF_K, + jt: 5, + jf: 0, + k: libc::SYS_sendto as u32, + }, + // [5] Check sendmsg → NOTIFY [10] + SockFilter { + code: BPF_JMP | BPF_JEQ | BPF_K, + jt: 4, + jf: 0, + k: libc::SYS_sendmsg as u32, + }, + // [6] Check recvfrom → NOTIFY [10] + SockFilter { + code: BPF_JMP | BPF_JEQ | BPF_K, + jt: 3, + jf: 0, + k: libc::SYS_recvfrom as u32, + }, + // [7] Check recvmsg → NOTIFY [10] + SockFilter { + code: BPF_JMP | BPF_JEQ | BPF_K, + jt: 2, + jf: 0, + k: libc::SYS_recvmsg as u32, + }, + // [8] Check bind → NOTIFY [10] (prevent binding to arbitrary ports) + SockFilter { + code: BPF_JMP | BPF_JEQ | BPF_K, + jt: 1, + jf: 0, + k: libc::SYS_bind as u32, + }, + // [9] ALLOW + SockFilter { + code: BPF_RET | BPF_K, + jt: 0, + jf: 0, + k: SECCOMP_RET_ALLOW, + }, + // [10] NOTIFY + SockFilter { + code: BPF_RET | BPF_K, + jt: 0, + jf: 0, + k: SECCOMP_RET_USER_NOTIF, + }, + ]; + + let prog = SockFprog { + len: u16::try_from(filter.len()).expect("BPF filter exceeds u16::MAX instructions"), + filter: filter.as_ptr(), + }; + + // SAFETY: The SockFprog and SockFilter arrays are 
#[repr(C)] with correct + // layout for the kernel ABI. The filter array is stack-allocated and lives + // for the duration of the syscall. PR_SET_NO_NEW_PRIVS must be set before + // this call. The returned fd is valid until closed. + let fd = unsafe { + libc::syscall( + libc::SYS_seccomp, + SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_NEW_LISTENER, + std::ptr::from_ref(&prog), + ) + }; + + if fd < 0 { + return Err(io::Error::last_os_error()); + } + + Ok(fd as RawFd) + } + + /// Receive a seccomp notification from the notification fd. + /// + /// Blocks until a notification is available. + pub fn recv_notif(notify_fd: RawFd) -> io::Result<SeccompNotif> { + // SAFETY: SeccompNotif is #[repr(C)] and matches the kernel's + // struct seccomp_notif layout. The kernel writes all fields. + let mut notif: SeccompNotif = unsafe { mem::zeroed() }; + let ret = unsafe { libc::ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_RECV, &mut notif) }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + Ok(notif) + } + + /// Send a response to a seccomp notification. + pub fn send_resp(notify_fd: RawFd, resp: &SeccompNotifResp) -> io::Result<()> { + let ret = unsafe { libc::ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SEND, resp) }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + Ok(()) + } + + /// Check if a notification ID is still valid. + /// + /// Note: uses the post-Linux-5.17 ioctl constant (`_IOW`). On kernels + /// 5.0-5.16, this always returns `false` (the old constant was `_IOR`). + /// Callers should treat a `false` result as "proceed with caution" and + /// verify the operation result, not as "definitely expired." + pub fn id_valid(notify_fd: RawFd, id: u64) -> bool { + let ret = unsafe { libc::ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) }; + ret == 0 + } + + /// Open a pid fd for a process. 
+ pub fn pidfd_open(pid: u32) -> io::Result<RawFd> { + #[allow(clippy::cast_possible_wrap)] + let pid_t = pid as libc::pid_t; + let fd = unsafe { libc::syscall(SYS_PIDFD_OPEN, pid_t, 0_u32) }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + Ok(fd as RawFd) + } + + /// Duplicate a file descriptor from another process via its pidfd. + /// + /// # Security Note + /// + /// A multi-threaded child can `dup2()` a different fd into `target_fd` + /// between the notification and this call. The caller should verify the + /// duplicated fd is a socket of the expected type after duplication. + pub fn pidfd_getfd(pidfd: RawFd, target_fd: RawFd) -> io::Result<RawFd> { + let fd = unsafe { libc::syscall(SYS_PIDFD_GETFD, pidfd, target_fd, 0_u32) }; + if fd < 0 { + return Err(io::Error::last_os_error()); + } + Ok(fd as RawFd) + } + + /// Verify that a duplicated fd is a socket (not a regular file or pipe). + /// + /// Call this after `pidfd_getfd()` to mitigate the fd-swap race: if a + /// malicious child `dup2()`d a non-socket fd into the target slot, this + /// check catches it. + pub fn verify_socket_fd(fd: RawFd) -> io::Result<bool> { + let mut stat: libc::stat = unsafe { mem::zeroed() }; + // SAFETY: fstat on a valid fd is safe. The stat struct is zeroed and + // fully written by the kernel on success. + let ret = unsafe { libc::fstat(fd, std::ptr::from_mut(&mut stat)) }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + // S_IFSOCK = 0o140000 + Ok((stat.st_mode & libc::S_IFMT) == libc::S_IFSOCK) + } + + /// Read bytes from another process's memory via `/proc/<pid>/mem`. + /// + /// Uses `read_exact` to ensure the full buffer is filled. Returns an + /// error if the read is short (e.g., at an unmapped page boundary). + /// + /// # Security Note + /// + /// Between the notification and this read, the process may exit and the + /// pid may be recycled. Call `id_valid()` before and check the result + /// of this function. 
For stronger guarantees, use `process_vm_readv()` + /// with a pidfd (not implemented here). + pub fn read_process_memory(pid: u32, addr: u64, buf: &mut [u8]) -> io::Result<()> { + use std::io::{Read, Seek}; + + let path = format!("/proc/{pid}/mem"); + let mut file = std::fs::File::open(&path)?; + file.seek(io::SeekFrom::Start(addr))?; + file.read_exact(buf) + } + + /// Parse a `sockaddr_in` or `sockaddr_in6` from raw bytes. + /// + /// `sa_family` is in native byte order. Port is in network (big-endian) + /// byte order per the `sockaddr_in` ABI. + pub fn parse_sockaddr(buf: &[u8]) -> Option<std::net::SocketAddr> { + if buf.len() < 2 { + return None; + } + + // sa_family is in native byte order + let family = u16::from_ne_bytes([buf[0], buf[1]]); + + match i32::from(family) { + libc::AF_INET if buf.len() >= size_of::<libc::sockaddr_in>() => { + // sin_port is in network (big-endian) byte order + let port = u16::from_be_bytes([buf[2], buf[3]]); + let ip = std::net::Ipv4Addr::new(buf[4], buf[5], buf[6], buf[7]); + Some(std::net::SocketAddr::V4(std::net::SocketAddrV4::new( + ip, port, + ))) + } + libc::AF_INET6 if buf.len() >= size_of::<libc::sockaddr_in6>() => { + let port = u16::from_be_bytes([buf[2], buf[3]]); + // sin6_flowinfo at bytes 4-7 (network byte order) + let flowinfo = u32::from_be_bytes([buf[4], buf[5], buf[6], buf[7]]); + // sin6_addr at bytes 8-23 + let mut ip_bytes = [0u8; 16]; + ip_bytes.copy_from_slice(&buf[8..24]); + let ip = std::net::Ipv6Addr::from(ip_bytes); + // sin6_scope_id at bytes 24-27 (native byte order per POSIX) + let scope_id = u32::from_ne_bytes([buf[24], buf[25], buf[26], buf[27]]); + Some(std::net::SocketAddr::V6(std::net::SocketAddrV6::new( + ip, port, flowinfo, scope_id, + ))) + } + _ => None, + } + } + + /// Deny a notification with EPERM. + pub fn deny(notify_fd: RawFd, id: u64) -> io::Result<()> { + send_resp( + notify_fd, + &SeccompNotifResp { + id, + val: 0, + error: -libc::EPERM, + flags: 0, + }, + ) + } + + /// Allow a `connect()` notification by returning success (0) to the caller. 
+ /// + /// **Only valid for `connect()` syscalls** which return 0 on success. + /// For `sendto()`/`sendmsg()`, the supervisor must perform the operation + /// on behalf of the child and return the actual byte count via + /// `send_resp()` directly. + pub fn allow_connect(notify_fd: RawFd, id: u64) -> io::Result<()> { + send_resp( + notify_fd, + &SeccompNotifResp { + id, + val: 0, + error: 0, + flags: 0, + }, + ) + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use std::net::{Ipv4Addr, Ipv6Addr}; + + #[test] + fn allowlist_permits_loopback() { + let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128); + let allowlist = DnsPinnedAllowlist::new(proxy); + assert!(allowlist.is_allowed(&IpAddr::V4(Ipv4Addr::LOCALHOST))); + assert!(allowlist.is_allowed(&IpAddr::V6(Ipv6Addr::LOCALHOST))); + } + + #[test] + fn allowlist_denies_arbitrary_ip() { + let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128); + let allowlist = DnsPinnedAllowlist::new(proxy); + let external = IpAddr::V4(Ipv4Addr::new(198, 51, 100, 5)); + assert!(!allowlist.is_allowed(&external)); + } + + #[test] + fn allowlist_resolves_domain() { + let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128); + let mut allowlist = DnsPinnedAllowlist::new(proxy); + let count = allowlist.add_domain("localhost").unwrap(); + assert!(count > 0); + assert!(allowlist.is_allowed(&IpAddr::V4(Ipv4Addr::LOCALHOST))); + } + + #[test] + fn allowlist_proxy_addr() { + let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128); + let allowlist = DnsPinnedAllowlist::new(proxy); + assert_eq!(allowlist.proxy_addr(), proxy); + } + + #[test] + fn allowlist_len_and_empty() { + let proxy = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 3128); + let allowlist = DnsPinnedAllowlist::new(proxy); + assert!(allowlist.len() >= 2); + 
assert!(allowlist.is_empty()); + } + + #[cfg(target_os = "linux")] + mod linux_tests { + use super::super::linux::parse_sockaddr; + + #[test] + fn parse_ipv4_sockaddr() { + let mut buf = [0u8; 16]; + // AF_INET = 2 in native byte order + let family_bytes = 2u16.to_ne_bytes(); + buf[0] = family_bytes[0]; + buf[1] = family_bytes[1]; + // Port 443 in big-endian + buf[2] = 0x01; + buf[3] = 0xBB; + // IP 93.184.216.34 + buf[4] = 93; + buf[5] = 184; + buf[6] = 216; + buf[7] = 34; + + let addr = parse_sockaddr(&buf).unwrap(); + assert_eq!(addr.port(), 443); + match addr { + std::net::SocketAddr::V4(v4) => { + assert_eq!(*v4.ip(), std::net::Ipv4Addr::new(93, 184, 216, 34)); + } + std::net::SocketAddr::V6(_) => panic!("expected IPv4"), + } + } + + #[test] + fn parse_too_short_returns_none() { + let buf = [0u8; 1]; + assert!(parse_sockaddr(&buf).is_none()); + } + + #[test] + fn parse_unknown_family_returns_none() { + let buf = [0xFF; 16]; + assert!(parse_sockaddr(&buf).is_none()); + } + } +}