// Capability management: two-phase capability drop and explicit capability-set application.
use crate::error::{NucleusError, Result};
use caps::{CapSet, Capability, CapsHashSet};
use tracing::{debug, info};
/// Security context that tracks capability state.
///
/// The manager stores no capability data of its own: its single field records
/// how far the drop sequence has progressed, which is what lets the drop
/// methods skip work that has already been done.
#[derive(Debug)]
pub struct CapabilityManager {
    /// Current position in the two-phase drop sequence.
    phase: CapPhase,
}

/// Tracks which phase of the two-phase cap drop we're in.
///
/// Docker/runc convention: the identity switch (setuid/setgid) must happen
/// between bounding-set cleanup and final cap clear. This is because:
/// - PR_CAPBSET_DROP requires CAP_SETPCAP in the effective set
/// - setuid/setgid require CAP_SETUID/CAP_SETGID in the effective set
/// - After setuid to non-zero UID, the kernel auto-clears permitted/effective
///
/// So the ordering is: drop bounding → setuid/setgid → clear remaining caps.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CapPhase {
    /// No caps have been modified yet
    Initial,
    /// Bounding set dropped; effective/permitted still intact for identity switch
    BoundingDropped,
    /// All caps fully dropped (terminal state)
    Dropped,
}
/// Explicit capability sets handed to `CapabilityManager::apply_sets`.
///
/// `apply_sets` treats `bounding` as a drop-only upper bound (every capability
/// that is not listed gets dropped from the bounding set) and sets the other
/// four sets to exactly the listed values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CapabilitySets {
/// Capabilities to keep in the bounding set; everything else is dropped.
pub bounding: Vec<Capability>,
/// Exact contents of the permitted set.
pub permitted: Vec<Capability>,
/// Exact contents of the effective set.
pub effective: Vec<Capability>,
/// Exact contents of the inheritable set.
pub inheritable: Vec<Capability>,
/// Exact contents of the ambient set.
pub ambient: Vec<Capability>,
}
impl CapabilityManager {
/// Creates a manager in the initial phase, i.e. before any capability set
/// has been modified through it.
pub fn new() -> Self {
    let phase = CapPhase::Initial;
    Self { phase }
}
/// Phase 1 of the drop: empty the bounding set, then clear the ambient and
/// inheritable sets.
///
/// Permitted and effective are deliberately left untouched so the caller can
/// still perform the identity switch (setuid/setgid), which needs CAP_SETUID
/// and CAP_SETGID. Call [`Self::finalize_drop`] once the switch is done.
///
/// The bounding set is handled first, following the Docker/runc convention,
/// because removing bounding entries needs CAP_SETPCAP in the effective set.
///
/// Does nothing and returns `Ok(())` unless the manager is still in its
/// initial phase.
///
/// # Errors
///
/// Returns [`NucleusError::CapabilityError`] when the bounding set cannot be
/// read back, when it is not empty after the drop loop, or when clearing the
/// ambient or inheritable set fails.
pub fn drop_bounding_set(&mut self) -> Result<()> {
    if !matches!(self.phase, CapPhase::Initial) {
        debug!("Bounding set already dropped, skipping");
        return Ok(());
    }

    info!("Phase 1: dropping bounding set and ambient/inheritable caps");

    // Step 1: remove every capability from the bounding set (needs
    // CAP_SETPCAP in the effective set). An empty bounding set stops caps
    // from being regained by exec'ing setuid binaries. Individual failures
    // are only logged; the read-back below is the authoritative check.
    caps::all().into_iter().for_each(|capability| {
        if let Err(err) = caps::drop(None, CapSet::Bounding, capability) {
            debug!(
                "Failed to drop bounding cap {:?}: {} (may not be present)",
                capability, err
            );
        }
    });

    // M4: read the bounding set back and refuse to continue if anything survived.
    let remaining = caps::read(None, CapSet::Bounding).map_err(|err| {
        NucleusError::CapabilityError(format!(
            "Failed to read bounding set after drop: {}",
            err
        ))
    })?;
    if !remaining.is_empty() {
        let names = remaining
            .iter()
            .map(|capability| format!("{:?}", capability))
            .collect::<Vec<String>>()
            .join(", ");
        return Err(NucleusError::CapabilityError(format!(
            "Bounding set still contains capabilities after drop: [{}]",
            names
        )));
    }

    // Step 2: ambient (constrained to permitted ∩ inheritable).
    caps::clear(None, CapSet::Ambient).map_err(|err| {
        NucleusError::CapabilityError(format!("Failed to clear ambient caps: {}", err))
    })?;

    // Step 3: inheritable, so nothing leaks across exec.
    caps::clear(None, CapSet::Inheritable).map_err(|err| {
        NucleusError::CapabilityError(format!("Failed to clear inheritable caps: {}", err))
    })?;

    // Effective/permitted stay as they are: they carry CAP_SETUID, CAP_SETGID
    // and CAP_SETPCAP for the upcoming identity switch.
    self.phase = CapPhase::BoundingDropped;
    info!("Phase 1 complete: bounding/ambient/inheritable cleared, effective/permitted retained for identity switch");
    Ok(())
}
/// Phase 2: Clear all remaining capabilities (permitted + effective).
///
/// Call this AFTER the identity switch (setuid/setgid). If the process
/// switched to a non-root UID, the kernel already cleared these sets;
/// this call makes it explicit and verifies the result.
///
/// If no identity switch was needed (process stays root), this performs
/// the actual clear.
///
/// If phase 1 has not run yet, it is executed first. Once the manager has
/// reached the fully-dropped phase, further calls are no-ops.
///
/// # Errors
///
/// Returns [`NucleusError::CapabilityError`] when phase 1 fails, when either
/// set cannot be cleared or read back, or when a capability is still present
/// in the effective or permitted set after the clear.
pub fn finalize_drop(&mut self) -> Result<()> {
    match self.phase {
        CapPhase::Dropped => {
            debug!("Capabilities already fully dropped, skipping");
            return Ok(());
        }
        // Caller skipped phase 1 – do full drop for backwards compat
        CapPhase::Initial => self.drop_bounding_set()?,
        CapPhase::BoundingDropped => {}
    }

    info!("Phase 2: clearing permitted and effective caps");

    // Effective is cleared first. The kernel refuses any capset() whose
    // effective set is not a subset of its permitted set, so emptying
    // effective before permitted keeps every intermediate state valid no
    // matter how the `caps` crate builds the individual requests.
    caps::clear(None, CapSet::Effective).map_err(|e| {
        NucleusError::CapabilityError(format!("Failed to clear effective caps: {}", e))
    })?;
    caps::clear(None, CapSet::Permitted).map_err(|e| {
        NucleusError::CapabilityError(format!("Failed to clear permitted caps: {}", e))
    })?;

    // Read both sets back so a silent partial clear cannot be reported as a
    // successful drop (same idea as the bounding-set check in phase 1).
    for (set, label) in [
        (CapSet::Effective, "effective"),
        (CapSet::Permitted, "permitted"),
    ] {
        let remaining = caps::read(None, set).map_err(|e| {
            NucleusError::CapabilityError(format!(
                "Failed to read {label} set after clear: {e}"
            ))
        })?;
        if !remaining.is_empty() {
            let leaked: Vec<String> = remaining.iter().map(|c| format!("{c:?}")).collect();
            return Err(NucleusError::CapabilityError(format!(
                "Capabilities remain in {label} set after clear: [{}]",
                leaked.join(", ")
            )));
        }
    }

    self.phase = CapPhase::Dropped;
    info!("Successfully dropped all capabilities (including bounding set)");
    Ok(())
}
/// One-shot convenience wrapper that drops every capability.
///
/// Runs [`Self::drop_bounding_set`] and, if that succeeds, [`Self::finalize_drop`].
/// Prefer calling the two phases separately when an identity switch has to
/// happen in between.
///
/// # Errors
///
/// Propagates the first error returned by either phase.
pub fn drop_all(&mut self) -> Result<()> {
    self.drop_bounding_set()
        .and_then(|()| self.finalize_drop())
}
/// Drop all capabilities except the specified ones
///
/// For most use cases, we drop ALL capabilities. This method is provided
/// for special cases where specific capabilities are needed.
///
/// Every capability not listed in `keep` is removed from the bounding,
/// inheritable, effective and permitted sets; the ambient set is cleared
/// entirely. On success the manager is marked as fully dropped, and calling
/// this on an already fully-dropped manager is a no-op.
///
/// # Errors
///
/// Returns [`NucleusError::CapabilityError`] when the ambient set cannot be
/// cleared or when a capability cannot be removed from the inheritable,
/// effective or permitted set. Failures while dropping from the bounding set
/// are only logged at debug level.
pub fn drop_except(&mut self, keep: &[Capability]) -> Result<()> {
    if self.phase == CapPhase::Dropped {
        debug!("Capabilities already dropped, skipping");
        return Ok(());
    }

    info!("Dropping capabilities except: {:?}", keep);

    // Work out the capabilities to remove once; both passes below use it.
    let to_drop: Vec<Capability> = caps::all()
        .into_iter()
        .filter(|cap| !keep.contains(cap))
        .collect();

    // 1. Drop bounding set entries FIRST (requires CAP_SETPCAP in effective).
    for cap in &to_drop {
        if let Err(e) = caps::drop(None, CapSet::Bounding, *cap) {
            debug!(
                "Failed to drop bounding cap {:?}: {} (may not be present)",
                cap, e
            );
        }
    }

    // 2. Clear ambient set (constrained to permitted ∩ inheritable).
    caps::clear(None, CapSet::Ambient).map_err(|e| {
        NucleusError::CapabilityError(format!("Failed to clear ambient caps: {}", e))
    })?;

    // 3. Drop from inheritable, effective, permitted for each non-kept cap.
    //    Effective must go before Permitted: the kernel rejects a capset()
    //    whose effective set is not a subset of its permitted set, so removing
    //    a capability from Permitted while it is still effective fails with
    //    EPERM.
    for cap in &to_drop {
        for set in [CapSet::Inheritable, CapSet::Effective, CapSet::Permitted] {
            caps::drop(None, set, *cap).map_err(|e| {
                NucleusError::CapabilityError(format!("Failed to drop {cap:?}: {e}"))
            })?;
        }
    }

    self.phase = CapPhase::Dropped;
    info!("Successfully dropped capabilities");
    Ok(())
}
/// Apply explicit capability sets.
///
/// Bounding is handled as a drop-only upper bound; the remaining sets are
/// set exactly to the provided values.
///
/// The sets are applied in the order bounding → effective → permitted →
/// inheritable → ambient. On success the manager is marked as fully dropped,
/// and calling this on an already fully-dropped manager is a no-op.
///
/// # Errors
///
/// Returns [`NucleusError::CapabilityError`] when the effective, permitted,
/// inheritable or ambient set cannot be set. Failures while dropping from the
/// bounding set are only logged at debug level. An error may leave the sets
/// partially applied.
pub fn apply_sets(&mut self, sets: &CapabilitySets) -> Result<()> {
    if self.phase == CapPhase::Dropped {
        debug!("Capabilities already dropped, skipping");
        return Ok(());
    }

    info!("Applying explicit capability sets");

    // The bounding set can only shrink: drop whatever is not requested.
    for cap in caps::all() {
        if sets.bounding.contains(&cap) {
            continue;
        }
        if let Err(e) = caps::drop(None, CapSet::Bounding, cap) {
            debug!(
                "Failed to drop bounding cap {:?}: {} (may not be present)",
                cap, e
            );
        }
    }

    // M5 (revised): Effective is set BEFORE Permitted. `caps::set` changes one
    // set per call, and the kernel rejects a capset() whose effective set is
    // not a subset of its permitted set. Shrinking Permitted first therefore
    // fails with EPERM whenever the current effective set still holds a
    // capability that is missing from `sets.permitted`. Narrowing Effective
    // first keeps every intermediate state valid, and any input that worked
    // in the Permitted-first order works in this order too.
    caps::set(None, CapSet::Effective, &to_caps_hash_set(&sets.effective)).map_err(|e| {
        NucleusError::CapabilityError(format!("Failed to set effective caps: {}", e))
    })?;
    caps::set(None, CapSet::Permitted, &to_caps_hash_set(&sets.permitted)).map_err(|e| {
        NucleusError::CapabilityError(format!("Failed to set permitted caps: {}", e))
    })?;

    caps::set(
        None,
        CapSet::Inheritable,
        &to_caps_hash_set(&sets.inheritable),
    )
    .map_err(|e| {
        NucleusError::CapabilityError(format!("Failed to set inheritable caps: {}", e))
    })?;

    // Ambient goes last: it is constrained by permitted and inheritable.
    caps::set(None, CapSet::Ambient, &to_caps_hash_set(&sets.ambient)).map_err(|e| {
        NucleusError::CapabilityError(format!("Failed to set ambient caps: {}", e))
    })?;

    self.phase = CapPhase::Dropped;
    info!("Successfully applied capability sets");
    Ok(())
}
/// Reports whether the manager has reached the terminal, fully-dropped phase.
///
/// Returns `false` both before any drop and after phase 1 alone.
pub fn is_dropped(&self) -> bool {
    matches!(self.phase, CapPhase::Dropped)
}
/// Verify that namespace-creating capabilities are actually absent from
/// the effective set. This is a runtime guard for the clone3 seccomp
/// invariant: clone3 cannot be argument-filtered at the BPF level, so
/// we rely on CAP_SYS_ADMIN (et al.) being dropped to prevent namespace
/// creation. If the check fails in production mode, it returns an error;
/// otherwise it emits a warning.
pub fn verify_no_namespace_caps(production: bool) -> Result<()> {
use caps::Capability;
let ns_caps = [
Capability::CAP_SYS_ADMIN,
Capability::CAP_NET_ADMIN,
Capability::CAP_SYS_PTRACE,
];
let effective = caps::read(None, CapSet::Effective).map_err(|e| {
NucleusError::CapabilityError(format!("Failed to read effective caps: {}", e))
})?;
let mut leaked = Vec::new();
for cap in &ns_caps {
if effective.contains(cap) {
leaked.push(format!("{:?}", cap));
}
}
if !leaked.is_empty() {
let msg = format!(
"SEC-CLONE3: namespace-creating capabilities still present after drop: [{}]. \
clone3 syscall is allowed without argument filtering – these caps \
must be absent to prevent namespace escape.",
leaked.join(", ")
);
if production {
return Err(NucleusError::CapabilityError(msg));
}
tracing::warn!("{}", msg);
}
Ok(())
}
}
impl Default for CapabilityManager {
fn default() -> Self {
Self::new()
}
}
fn to_caps_hash_set(caps_list: &[Capability]) -> CapsHashSet {
caps_list.iter().copied().collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed manager must not report itself as dropped.
    #[test]
    fn test_capability_manager_initial_state() {
        assert!(!CapabilityManager::new().is_dropped());
    }

    /// Dropping twice must behave like dropping once.
    #[test]
    fn test_drop_idempotent() {
        let mut manager = CapabilityManager::new();

        // The first drop may fail in unprivileged test environments (M4
        // bounding-set verification). That is expected and not a test
        // failure; only the success path is checked for idempotency.
        if manager.drop_all().is_ok() {
            assert!(manager.is_dropped());

            // A second drop must succeed as well and leave the flag set.
            assert!(manager.drop_all().is_ok());
            assert!(manager.is_dropped());
        }
    }

    /// Phase 1 alone must not mark the manager as dropped; phase 2 must.
    #[test]
    fn test_two_phase_drop() {
        let mut manager = CapabilityManager::new();

        // Phase 1 may fail in unprivileged tests; nothing to check then.
        if manager.drop_bounding_set().is_err() {
            return;
        }
        assert!(!manager.is_dropped()); // not fully dropped yet

        // The final clear may fail in the test environment as well.
        if manager.finalize_drop().is_ok() {
            assert!(manager.is_dropped());
        }
    }
}