Skip to main content

nucleus/security/
capabilities.rs

1use crate::error::{NucleusError, Result};
2use caps::{CapSet, Capability, CapsHashSet};
3use tracing::{debug, info};
4
5/// Security context that tracks capability state
6pub struct CapabilityManager {
7    phase: CapPhase,
8}
9
/// Progress marker for the two-phase capability drop.
///
/// Following the Docker/runc convention, the identity switch (setuid/setgid)
/// has to sit between the bounding-set cleanup and the final clear, because:
/// - `PR_CAPBSET_DROP` needs `CAP_SETPCAP` in the effective set,
/// - setuid/setgid need `CAP_SETUID`/`CAP_SETGID` in the effective set,
/// - once the UID becomes non-zero the kernel clears permitted/effective
///   on its own.
///
/// Resulting order: drop bounding → setuid/setgid → clear remaining caps.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CapPhase {
    /// Nothing has been modified yet.
    Initial,
    /// The bounding set is gone; effective/permitted are still intact so the
    /// identity switch can be performed.
    BoundingDropped,
    /// Terminal state: the capability configuration is final. Reached after a
    /// full drop, and also after `drop_except` / `apply_sets`, which may keep
    /// selected capabilities.
    Dropped,
}
28
29#[derive(Debug, Clone, PartialEq, Eq)]
30pub struct CapabilitySets {
31    pub bounding: Vec<Capability>,
32    pub permitted: Vec<Capability>,
33    pub effective: Vec<Capability>,
34    pub inheritable: Vec<Capability>,
35    pub ambient: Vec<Capability>,
36}
37
38impl CapabilityManager {
39    pub fn new() -> Self {
40        Self {
41            phase: CapPhase::Initial,
42        }
43    }
44
45    /// Phase 1: Drop the bounding set and clear ambient/inheritable caps.
46    ///
47    /// After this call, CAP_SETUID and CAP_SETGID remain in the effective set
48    /// so the caller can perform the identity switch (setuid/setgid). Call
49    /// [`finalize_drop`] after the identity switch to clear remaining caps.
50    ///
51    /// This follows Docker/runc convention: bounding set is cleared first
52    /// while CAP_SETPCAP is still in the effective set.
53    pub fn drop_bounding_set(&mut self) -> Result<()> {
54        if self.phase != CapPhase::Initial {
55            debug!("Bounding set already dropped, skipping");
56            return Ok(());
57        }
58
59        info!("Phase 1: dropping bounding set and ambient/inheritable caps");
60
61        // 1. Clear bounding set (requires CAP_SETPCAP in effective set).
62        //    Prevents regaining capabilities through exec of setuid binaries.
63        for cap in caps::all() {
64            if let Err(e) = caps::drop(None, CapSet::Bounding, cap) {
65                debug!(
66                    "Failed to drop bounding cap {:?}: {} (may not be present)",
67                    cap, e
68                );
69            }
70        }
71
72        // M4: Verify the bounding set is actually empty after the drop loop
73        let bounding = caps::read(None, CapSet::Bounding).map_err(|e| {
74            NucleusError::CapabilityError(format!("Failed to read bounding set after drop: {}", e))
75        })?;
76        if !bounding.is_empty() {
77            let leaked: Vec<String> = bounding.iter().map(|c| format!("{:?}", c)).collect();
78            return Err(NucleusError::CapabilityError(format!(
79                "Bounding set still contains capabilities after drop: [{}]",
80                leaked.join(", ")
81            )));
82        }
83
84        // 2. Clear ambient set (constrained to permitted ∩ inheritable).
85        caps::clear(None, CapSet::Ambient).map_err(|e| {
86            NucleusError::CapabilityError(format!("Failed to clear ambient caps: {}", e))
87        })?;
88
89        // 3. Clear inheritable (prevents caps leaking across exec).
90        caps::clear(None, CapSet::Inheritable).map_err(|e| {
91            NucleusError::CapabilityError(format!("Failed to clear inheritable caps: {}", e))
92        })?;
93
94        // Effective/permitted are intentionally kept – they hold CAP_SETUID,
95        // CAP_SETGID, and CAP_SETPCAP needed for the identity switch.
96
97        self.phase = CapPhase::BoundingDropped;
98        info!("Phase 1 complete: bounding/ambient/inheritable cleared, effective/permitted retained for identity switch");
99
100        Ok(())
101    }
102
103    /// Phase 2: Clear all remaining capabilities (permitted + effective).
104    ///
105    /// Call this AFTER the identity switch (setuid/setgid). If the process
106    /// switched to a non-root UID, the kernel already cleared these sets;
107    /// this call makes it explicit and verifies the result.
108    ///
109    /// If no identity switch was needed (process stays root), this performs
110    /// the actual clear.
111    pub fn finalize_drop(&mut self) -> Result<()> {
112        if self.phase == CapPhase::Dropped {
113            debug!("Capabilities already fully dropped, skipping");
114            return Ok(());
115        }
116
117        if self.phase == CapPhase::Initial {
118            // Caller skipped phase 1 – do full drop for backwards compat
119            self.drop_bounding_set()?;
120        }
121
122        info!("Phase 2: clearing permitted and effective caps");
123
124        caps::clear(None, CapSet::Permitted).map_err(|e| {
125            NucleusError::CapabilityError(format!("Failed to clear permitted caps: {}", e))
126        })?;
127
128        caps::clear(None, CapSet::Effective).map_err(|e| {
129            NucleusError::CapabilityError(format!("Failed to clear effective caps: {}", e))
130        })?;
131
132        self.phase = CapPhase::Dropped;
133        info!("Successfully dropped all capabilities (including bounding set)");
134
135        Ok(())
136    }
137
138    /// Drop all capabilities in a single call (convenience wrapper).
139    ///
140    /// Equivalent to calling [`drop_bounding_set`] then [`finalize_drop`].
141    /// Use the two-phase API when an identity switch is needed between phases.
142    pub fn drop_all(&mut self) -> Result<()> {
143        self.drop_bounding_set()?;
144        self.finalize_drop()
145    }
146
147    /// Drop all capabilities except the specified ones
148    ///
149    /// For most use cases, we drop ALL capabilities. This method is provided
150    /// for special cases where specific capabilities are needed.
151    pub fn drop_except(&mut self, keep: &[Capability]) -> Result<()> {
152        if self.phase == CapPhase::Dropped {
153            debug!("Capabilities already dropped, skipping");
154            return Ok(());
155        }
156
157        info!("Dropping capabilities except: {:?}", keep);
158
159        let all_caps = caps::all();
160
161        // 1. Drop bounding set entries FIRST (requires CAP_SETPCAP in effective).
162        for cap in &all_caps {
163            if !keep.contains(cap) {
164                if let Err(e) = caps::drop(None, CapSet::Bounding, *cap) {
165                    debug!(
166                        "Failed to drop bounding cap {:?}: {} (may not be present)",
167                        cap, e
168                    );
169                }
170            }
171        }
172
173        // 2. Clear ambient set (constrained to permitted ∩ inheritable).
174        caps::clear(None, CapSet::Ambient).map_err(|e| {
175            NucleusError::CapabilityError(format!("Failed to clear ambient caps: {}", e))
176        })?;
177
178        // 3. Drop from inheritable, permitted, effective for each non-kept cap.
179        for cap in &all_caps {
180            if !keep.contains(cap) {
181                caps::drop(None, CapSet::Inheritable, *cap).map_err(|e| {
182                    NucleusError::CapabilityError(format!("Failed to drop {cap:?}: {e}"))
183                })?;
184
185                caps::drop(None, CapSet::Permitted, *cap).map_err(|e| {
186                    NucleusError::CapabilityError(format!("Failed to drop {cap:?}: {e}"))
187                })?;
188
189                caps::drop(None, CapSet::Effective, *cap).map_err(|e| {
190                    NucleusError::CapabilityError(format!("Failed to drop {cap:?}: {e}"))
191                })?;
192            }
193        }
194
195        self.phase = CapPhase::Dropped;
196        info!("Successfully dropped capabilities");
197
198        Ok(())
199    }
200
201    /// Apply explicit capability sets.
202    ///
203    /// Bounding is handled as a drop-only upper bound; the remaining sets are
204    /// set exactly to the provided values.
205    pub fn apply_sets(&mut self, sets: &CapabilitySets) -> Result<()> {
206        if self.phase == CapPhase::Dropped {
207            debug!("Capabilities already dropped, skipping");
208            return Ok(());
209        }
210
211        info!("Applying explicit capability sets");
212
213        for cap in caps::all() {
214            if !sets.bounding.contains(&cap) {
215                if let Err(e) = caps::drop(None, CapSet::Bounding, cap) {
216                    debug!(
217                        "Failed to drop bounding cap {:?}: {} (may not be present)",
218                        cap, e
219                    );
220                }
221            }
222        }
223
224        // M5: Set Permitted first, then Effective immediately after to avoid a
225        // window where the old effective set exceeds the new permitted set.
226        caps::set(None, CapSet::Permitted, &to_caps_hash_set(&sets.permitted)).map_err(|e| {
227            NucleusError::CapabilityError(format!("Failed to set permitted caps: {}", e))
228        })?;
229        caps::set(None, CapSet::Effective, &to_caps_hash_set(&sets.effective)).map_err(|e| {
230            NucleusError::CapabilityError(format!("Failed to set effective caps: {}", e))
231        })?;
232        caps::set(
233            None,
234            CapSet::Inheritable,
235            &to_caps_hash_set(&sets.inheritable),
236        )
237        .map_err(|e| {
238            NucleusError::CapabilityError(format!("Failed to set inheritable caps: {}", e))
239        })?;
240        caps::set(None, CapSet::Ambient, &to_caps_hash_set(&sets.ambient)).map_err(|e| {
241            NucleusError::CapabilityError(format!("Failed to set ambient caps: {}", e))
242        })?;
243
244        self.phase = CapPhase::Dropped;
245        info!("Successfully applied capability sets");
246        Ok(())
247    }
248
249    /// Check if capabilities have been dropped
250    pub fn is_dropped(&self) -> bool {
251        self.phase == CapPhase::Dropped
252    }
253
254    /// Verify that namespace-creating capabilities are actually absent from
255    /// the effective set. Seccomp blocks unshare, filters clone namespace flags,
256    /// and returns ENOSYS for clone3; dropping these capabilities is the
257    /// independent capability-layer guard. If the check fails in production
258    /// mode, it returns an error; otherwise it emits a warning.
259    pub fn verify_no_namespace_caps(production: bool) -> Result<()> {
260        use caps::Capability;
261        let ns_caps = [
262            Capability::CAP_SYS_ADMIN,
263            Capability::CAP_NET_ADMIN,
264            Capability::CAP_SYS_PTRACE,
265        ];
266        let effective = caps::read(None, CapSet::Effective).map_err(|e| {
267            NucleusError::CapabilityError(format!("Failed to read effective caps: {}", e))
268        })?;
269        let mut leaked = Vec::new();
270        for cap in &ns_caps {
271            if effective.contains(cap) {
272                leaked.push(format!("{:?}", cap));
273            }
274        }
275        if !leaked.is_empty() {
276            let msg = format!(
277                "SEC-CLONE3: namespace-creating capabilities still present after drop: [{}]. \
278                 seccomp denies unfilterable clone3, but these caps must still be \
279                 absent for defense in depth.",
280                leaked.join(", ")
281            );
282            if production {
283                return Err(NucleusError::CapabilityError(msg));
284            }
285            tracing::warn!("{}", msg);
286        }
287        Ok(())
288    }
289}
290
291impl Default for CapabilityManager {
292    fn default() -> Self {
293        Self::new()
294    }
295}
296
297fn to_caps_hash_set(caps_list: &[Capability]) -> CapsHashSet {
298    caps_list.iter().copied().collect()
299}
300
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed manager must not report itself as dropped.
    #[test]
    fn test_capability_manager_initial_state() {
        assert!(!CapabilityManager::new().is_dropped());
    }

    /// Once a full drop has succeeded, repeating it must succeed as well and
    /// leave the manager in the dropped state.
    #[test]
    fn test_drop_idempotent() {
        let mut mgr = CapabilityManager::new();

        // The first drop may fail in unprivileged test environments (M4
        // bounding-set verification). That is expected and not a test failure;
        // only the idempotency of the dropped flag is under test here.
        if mgr.drop_all().is_err() {
            return;
        }

        assert!(mgr.is_dropped());
        // Second drop should also succeed (idempotent).
        assert!(mgr.drop_all().is_ok());
        assert!(mgr.is_dropped());
    }

    /// Phase 1 alone must not mark the manager as dropped; phase 2 must.
    #[test]
    fn test_two_phase_drop() {
        let mut mgr = CapabilityManager::new();

        // Phase 1 may fail in unprivileged tests; that's fine.
        if mgr.drop_bounding_set().is_err() {
            return;
        }
        assert!(!mgr.is_dropped()); // not fully dropped yet

        // The final clear may also fail in the test environment.
        if mgr.finalize_drop().is_ok() {
            assert!(mgr.is_dropped());
        }
    }
}
343}