nucleus/security/
capabilities.rs

1use crate::error::{NucleusError, Result};
2use caps::{CapSet, Capability, CapsHashSet};
3use tracing::{debug, info};
4
5/// Security context that tracks capability state
6pub struct CapabilityManager {
7    phase: CapPhase,
8}
9
/// Tracks which phase of the two-phase capability drop we are in.
///
/// Docker/runc convention: the identity switch (setuid/setgid) must happen
/// between bounding-set cleanup and the final capability clear, because:
/// - PR_CAPBSET_DROP requires CAP_SETPCAP in the effective set
/// - setuid/setgid require CAP_SETUID/CAP_SETGID in the effective set
/// - After setuid to a non-zero UID, the kernel auto-clears permitted/effective
///
/// So the ordering is: drop bounding → setuid/setgid → clear remaining caps.
///
/// The phase only ever moves forward: `Initial` → `BoundingDropped` →
/// `Dropped`. `drop_except` and `apply_sets` jump straight to `Dropped`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CapPhase {
    /// No caps have been modified yet
    Initial,
    /// Bounding set dropped; effective/permitted still intact for identity switch
    BoundingDropped,
    /// All caps fully dropped (terminal state); every drop method becomes a
    /// no-op once this phase is reached
    Dropped,
}
28
/// Explicit description of the five per-thread capability sets, as consumed
/// by `CapabilityManager::apply_sets`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CapabilitySets {
    /// Capabilities to keep in the bounding set; everything not listed here
    /// is dropped (the bounding set can only shrink).
    pub bounding: Vec<Capability>,
    /// Exact contents requested for the permitted set.
    pub permitted: Vec<Capability>,
    /// Exact contents requested for the effective set.
    pub effective: Vec<Capability>,
    /// Exact contents requested for the inheritable set.
    pub inheritable: Vec<Capability>,
    /// Exact contents requested for the ambient set.
    pub ambient: Vec<Capability>,
}
37
38impl CapabilityManager {
39    pub fn new() -> Self {
40        Self {
41            phase: CapPhase::Initial,
42        }
43    }
44
45    /// Phase 1: Drop the bounding set and clear ambient/inheritable caps.
46    ///
47    /// After this call, CAP_SETUID and CAP_SETGID remain in the effective set
48    /// so the caller can perform the identity switch (setuid/setgid). Call
49    /// [`finalize_drop`] after the identity switch to clear remaining caps.
50    ///
51    /// This follows Docker/runc convention: bounding set is cleared first
52    /// while CAP_SETPCAP is still in the effective set.
53    pub fn drop_bounding_set(&mut self) -> Result<()> {
54        if self.phase != CapPhase::Initial {
55            debug!("Bounding set already dropped, skipping");
56            return Ok(());
57        }
58
59        info!("Phase 1: dropping bounding set and ambient/inheritable caps");
60
61        // 1. Clear bounding set (requires CAP_SETPCAP in effective set).
62        //    Prevents regaining capabilities through exec of setuid binaries.
63        for cap in caps::all() {
64            if let Err(e) = caps::drop(None, CapSet::Bounding, cap) {
65                debug!(
66                    "Failed to drop bounding cap {:?}: {} (may not be present)",
67                    cap, e
68                );
69            }
70        }
71
72        // M4: Verify the bounding set is actually empty after the drop loop
73        let bounding = caps::read(None, CapSet::Bounding).map_err(|e| {
74            NucleusError::CapabilityError(format!("Failed to read bounding set after drop: {}", e))
75        })?;
76        if !bounding.is_empty() {
77            let leaked: Vec<String> = bounding.iter().map(|c| format!("{:?}", c)).collect();
78            return Err(NucleusError::CapabilityError(format!(
79                "Bounding set still contains capabilities after drop: [{}]",
80                leaked.join(", ")
81            )));
82        }
83
84        // 2. Clear ambient set (constrained to permitted ∩ inheritable).
85        caps::clear(None, CapSet::Ambient).map_err(|e| {
86            NucleusError::CapabilityError(format!("Failed to clear ambient caps: {}", e))
87        })?;
88
89        // 3. Clear inheritable (prevents caps leaking across exec).
90        caps::clear(None, CapSet::Inheritable).map_err(|e| {
91            NucleusError::CapabilityError(format!("Failed to clear inheritable caps: {}", e))
92        })?;
93
94        // Effective/permitted are intentionally kept – they hold CAP_SETUID,
95        // CAP_SETGID, and CAP_SETPCAP needed for the identity switch.
96
97        self.phase = CapPhase::BoundingDropped;
98        info!("Phase 1 complete: bounding/ambient/inheritable cleared, effective/permitted retained for identity switch");
99
100        Ok(())
101    }
102
103    /// Phase 2: Clear all remaining capabilities (permitted + effective).
104    ///
105    /// Call this AFTER the identity switch (setuid/setgid). If the process
106    /// switched to a non-root UID, the kernel already cleared these sets;
107    /// this call makes it explicit and verifies the result.
108    ///
109    /// If no identity switch was needed (process stays root), this performs
110    /// the actual clear.
111    pub fn finalize_drop(&mut self) -> Result<()> {
112        if self.phase == CapPhase::Dropped {
113            debug!("Capabilities already fully dropped, skipping");
114            return Ok(());
115        }
116
117        if self.phase == CapPhase::Initial {
118            // Caller skipped phase 1 – do full drop for backwards compat
119            self.drop_bounding_set()?;
120        }
121
122        info!("Phase 2: clearing permitted and effective caps");
123
124        caps::clear(None, CapSet::Permitted).map_err(|e| {
125            NucleusError::CapabilityError(format!("Failed to clear permitted caps: {}", e))
126        })?;
127
128        caps::clear(None, CapSet::Effective).map_err(|e| {
129            NucleusError::CapabilityError(format!("Failed to clear effective caps: {}", e))
130        })?;
131
132        self.phase = CapPhase::Dropped;
133        info!("Successfully dropped all capabilities (including bounding set)");
134
135        Ok(())
136    }
137
138    /// Drop all capabilities in a single call (convenience wrapper).
139    ///
140    /// Equivalent to calling [`drop_bounding_set`] then [`finalize_drop`].
141    /// Use the two-phase API when an identity switch is needed between phases.
142    pub fn drop_all(&mut self) -> Result<()> {
143        self.drop_bounding_set()?;
144        self.finalize_drop()
145    }
146
147    /// Drop all capabilities except the specified ones
148    ///
149    /// For most use cases, we drop ALL capabilities. This method is provided
150    /// for special cases where specific capabilities are needed.
151    pub fn drop_except(&mut self, keep: &[Capability]) -> Result<()> {
152        if self.phase == CapPhase::Dropped {
153            debug!("Capabilities already dropped, skipping");
154            return Ok(());
155        }
156
157        info!("Dropping capabilities except: {:?}", keep);
158
159        let all_caps = caps::all();
160
161        // 1. Drop bounding set entries FIRST (requires CAP_SETPCAP in effective).
162        for cap in &all_caps {
163            if !keep.contains(cap) {
164                if let Err(e) = caps::drop(None, CapSet::Bounding, *cap) {
165                    debug!(
166                        "Failed to drop bounding cap {:?}: {} (may not be present)",
167                        cap, e
168                    );
169                }
170            }
171        }
172
173        // 2. Clear ambient set (constrained to permitted ∩ inheritable).
174        caps::clear(None, CapSet::Ambient).map_err(|e| {
175            NucleusError::CapabilityError(format!("Failed to clear ambient caps: {}", e))
176        })?;
177
178        // 3. Drop from inheritable, permitted, effective for each non-kept cap.
179        for cap in &all_caps {
180            if !keep.contains(cap) {
181                caps::drop(None, CapSet::Inheritable, *cap).map_err(|e| {
182                    NucleusError::CapabilityError(format!("Failed to drop {cap:?}: {e}"))
183                })?;
184
185                caps::drop(None, CapSet::Permitted, *cap).map_err(|e| {
186                    NucleusError::CapabilityError(format!("Failed to drop {cap:?}: {e}"))
187                })?;
188
189                caps::drop(None, CapSet::Effective, *cap).map_err(|e| {
190                    NucleusError::CapabilityError(format!("Failed to drop {cap:?}: {e}"))
191                })?;
192            }
193        }
194
195        self.phase = CapPhase::Dropped;
196        info!("Successfully dropped capabilities");
197
198        Ok(())
199    }
200
201    /// Apply explicit capability sets.
202    ///
203    /// Bounding is handled as a drop-only upper bound; the remaining sets are
204    /// set exactly to the provided values.
205    pub fn apply_sets(&mut self, sets: &CapabilitySets) -> Result<()> {
206        if self.phase == CapPhase::Dropped {
207            debug!("Capabilities already dropped, skipping");
208            return Ok(());
209        }
210
211        info!("Applying explicit capability sets");
212
213        for cap in caps::all() {
214            if !sets.bounding.contains(&cap) {
215                if let Err(e) = caps::drop(None, CapSet::Bounding, cap) {
216                    debug!(
217                        "Failed to drop bounding cap {:?}: {} (may not be present)",
218                        cap, e
219                    );
220                }
221            }
222        }
223
224        // M5: Set Permitted first, then Effective immediately after to avoid a
225        // window where the old effective set exceeds the new permitted set.
226        caps::set(None, CapSet::Permitted, &to_caps_hash_set(&sets.permitted)).map_err(|e| {
227            NucleusError::CapabilityError(format!("Failed to set permitted caps: {}", e))
228        })?;
229        caps::set(None, CapSet::Effective, &to_caps_hash_set(&sets.effective)).map_err(|e| {
230            NucleusError::CapabilityError(format!("Failed to set effective caps: {}", e))
231        })?;
232        caps::set(
233            None,
234            CapSet::Inheritable,
235            &to_caps_hash_set(&sets.inheritable),
236        )
237        .map_err(|e| {
238            NucleusError::CapabilityError(format!("Failed to set inheritable caps: {}", e))
239        })?;
240        caps::set(None, CapSet::Ambient, &to_caps_hash_set(&sets.ambient)).map_err(|e| {
241            NucleusError::CapabilityError(format!("Failed to set ambient caps: {}", e))
242        })?;
243
244        self.phase = CapPhase::Dropped;
245        info!("Successfully applied capability sets");
246        Ok(())
247    }
248
249    /// Check if capabilities have been dropped
250    pub fn is_dropped(&self) -> bool {
251        self.phase == CapPhase::Dropped
252    }
253
254    /// Verify that namespace-creating capabilities are actually absent from
255    /// the effective set. This is a runtime guard for the clone3 seccomp
256    /// invariant: clone3 cannot be argument-filtered at the BPF level, so
257    /// we rely on CAP_SYS_ADMIN (et al.) being dropped to prevent namespace
258    /// creation. If the check fails in production mode, it returns an error;
259    /// otherwise it emits a warning.
260    pub fn verify_no_namespace_caps(production: bool) -> Result<()> {
261        use caps::Capability;
262        let ns_caps = [
263            Capability::CAP_SYS_ADMIN,
264            Capability::CAP_NET_ADMIN,
265            Capability::CAP_SYS_PTRACE,
266        ];
267        let effective = caps::read(None, CapSet::Effective).map_err(|e| {
268            NucleusError::CapabilityError(format!("Failed to read effective caps: {}", e))
269        })?;
270        let mut leaked = Vec::new();
271        for cap in &ns_caps {
272            if effective.contains(cap) {
273                leaked.push(format!("{:?}", cap));
274            }
275        }
276        if !leaked.is_empty() {
277            let msg = format!(
278                "SEC-CLONE3: namespace-creating capabilities still present after drop: [{}]. \
279                 clone3 syscall is allowed without argument filtering – these caps \
280                 must be absent to prevent namespace escape.",
281                leaked.join(", ")
282            );
283            if production {
284                return Err(NucleusError::CapabilityError(msg));
285            }
286            tracing::warn!("{}", msg);
287        }
288        Ok(())
289    }
290}
291
292impl Default for CapabilityManager {
293    fn default() -> Self {
294        Self::new()
295    }
296}
297
298fn to_caps_hash_set(caps_list: &[Capability]) -> CapsHashSet {
299    caps_list.iter().copied().collect()
300}
301
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created manager must not report capabilities as dropped.
    #[test]
    fn test_capability_manager_initial_state() {
        let manager = CapabilityManager::new();
        assert!(!manager.is_dropped());
    }

    /// A second `drop_all` after a successful one must succeed and leave the
    /// manager in the dropped state.
    #[test]
    fn test_drop_idempotent() {
        let mut manager = CapabilityManager::new();

        // The first drop may fail in unprivileged test environments (M4
        // bounding-set verification). That is expected and not a test
        // failure; idempotency can only be checked after a successful drop.
        if manager.drop_all().is_err() {
            return;
        }
        assert!(manager.is_dropped());

        // Second drop should also succeed (idempotent).
        let second = manager.drop_all();
        assert!(second.is_ok());
        assert!(manager.is_dropped());
    }

    /// Phase 1 alone must not mark the manager as dropped; phase 2 must.
    #[test]
    fn test_two_phase_drop() {
        let mut manager = CapabilityManager::new();

        // Phase 1 may fail in unprivileged tests; that's fine.
        if manager.drop_bounding_set().is_err() {
            return;
        }
        assert!(!manager.is_dropped()); // not fully dropped yet

        // The final clear may fail in the test environment as well.
        if manager.finalize_drop().is_ok() {
            assert!(manager.is_dropped());
        }
    }
}
nucleus/security/capabilities.rs

nucleus/security/
capabilities.rs