pelagos 0.1.1

Fast Linux container runtime — OCI-compatible, namespaces, cgroups v2, seccomp, networking, image management
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
//! `pelagos exec` — run a command inside a running container.

use super::{check_liveness, parse_user, read_state, ContainerStatus};
use pelagos::container::{Command, Namespace, Stdio};
use std::os::unix::io::AsRawFd;
use std::path::PathBuf;
use std::sync::{
    atomic::{AtomicI32, Ordering},
    Arc,
};

// Command-line arguments for the `exec` subcommand, consumed by `cmd_exec`.
//
// NOTE: the `///` comments on the fields below are picked up by clap's derive
// macro as help text, so they are effectively user-facing strings. Maintainer
// notes are therefore written as plain `//` comments.
#[derive(Debug, clap::Args)]
pub struct ExecArgs {
    // Positional. `cmd_exec` passes this to `read_state` to locate the container.
    /// Container name
    pub name: String,

    // When set, `cmd_exec` spawns via `spawn_interactive()` instead of `spawn()`
    // with inherited stdio.
    /// Allocate a PTY for interactive use
    #[clap(long, short = 'i')]
    pub interactive: bool,

    // Applied on top of the environment read from the container process.
    // An entry without `=` is treated as a variable name whose value is copied
    // from the calling process's environment; it is skipped if unset there.
    /// Environment variable KEY=VALUE (repeatable)
    #[clap(long = "env", short = 'e')]
    pub env: Vec<String>,

    // When a mount namespace is joined and this is `None`, `cmd_exec` uses "/".
    /// Working directory inside the container
    #[clap(long = "workdir", short = 'w')]
    pub workdir: Option<String>,

    // Parsed by `parse_user` into a UID and an optional GID.
    /// `UID[:GID]` to run as (e.g. 1000 or 1000:1000)
    #[clap(long = "user", short = 'u')]
    pub user: Option<String>,

    // Trailing positional values: element 0 is the executable, the remainder
    // are its arguments. `allow_hyphen_values` lets arguments such as `-l`
    // be passed through rather than parsed as options of this subcommand.
    /// Command and arguments to run
    #[clap(multiple_values = true, required = true, allow_hyphen_values = true)]
    pub args: Vec<String>,
}

/// Entry point for the `exec` subcommand: run a command inside a running container.
///
/// Steps:
/// 1. Load the container state and require `ContainerStatus::Running` plus a
///    passing `check_liveness` on the recorded PID.
/// 2. Discover the namespaces that differ from init's and arrange to join them.
/// 3. Seed the environment from the container process's `/proc/{pid}/environ`,
///    then apply the `-e` overrides.
/// 4. Spawn the command — a PTY session when `interactive` is set, inherited
///    stdio otherwise.
///
/// Once the child has been waited on successfully this function does not
/// return: it terminates the current process with the child's exit code via
/// `std::process::exit`.
///
/// # Errors
///
/// Returns an error if the container state cannot be read, the container is
/// not running, no command was supplied, a namespace or root handle cannot be
/// opened, the working directory contains a NUL byte, the user spec is
/// rejected by `parse_user`, or spawning / waiting fails.
pub fn cmd_exec(args: ExecArgs) -> Result<(), Box<dyn std::error::Error>> {
    // 1. Validate container is running
    let state = read_state(&args.name)
        .map_err(|e| format!("container '{}' not found: {}", args.name, e))?;

    if state.status != ContainerStatus::Running || !check_liveness(state.pid) {
        return Err(format!("container '{}' is not running", args.name).into());
    }

    let pid = state.pid;

    // 2. Discover which namespaces the container has
    let ns_entries = discover_namespaces(pid)?;

    // 3. Read the container's environment
    let container_env = read_proc_environ(pid);

    // 4. Build Command
    //
    // clap enforces at least one value, but `ExecArgs` is a public struct with
    // public fields, so guard against an empty vector instead of panicking.
    let (exe, rest) = args.args.split_first().ok_or("no command specified")?;

    let mut cmd = Command::new(exe).args(rest);

    // The pre_exec order in container.rs is:
    //   chroot (step 4) → user callback (step 5) → setns (step 6)
    //
    // For exec we need setns(MOUNT) BEFORE chroot so the container's mount
    // table is active.  We handle mount-ns join via a pre_exec callback that
    // does: setns(mnt_fd) → fchdir(root_fd) → chroot(".") → chdir(workdir).
    // Non-mount namespaces use the normal with_namespace_join() path.
    let mut has_mount_ns = false;
    for (path, ns) in &ns_entries {
        if *ns == Namespace::MOUNT {
            has_mount_ns = true;
        } else {
            cmd = cmd.with_namespace_join(path, *ns);
        }
    }

    if has_mount_ns {
        // Open both fds in the parent (before fork) — inherited across fork.
        let mnt_ns_path = format!("/proc/{}/ns/mnt", pid);
        let mnt_ns_file = std::fs::File::open(&mnt_ns_path)
            .map_err(|e| format!("open {}: {}", mnt_ns_path, e))?;
        let mnt_ns_fd = mnt_ns_file.as_raw_fd();

        // Open the container's root directory as an fd.  After setns(MOUNT),
        // path-based resolution uses the host root (unchanged by setns).
        // fchdir(root_fd) + chroot(".") is the correct way to enter the
        // container's root — same technique as nsenter(1).
        //
        // IMPORTANT: with PID namespace enabled, state.pid = P (intermediate
        // process), which never called pivot_root — so /proc/P/root is the HOST
        // root.  Use find_root_pid() to find C (P's only child), which did
        // pivot_root and whose /proc/C/root is the container overlay root.
        let root_pid = find_root_pid(pid);
        let root_path = format!("/proc/{}/root", root_pid);
        let root_file =
            std::fs::File::open(&root_path).map_err(|e| format!("open {}: {}", root_path, e))?;
        let root_fd = root_file.as_raw_fd();

        // Build every C string in the parent.  The callback below runs in the
        // forked child before exec, where heap allocation and panics must be
        // avoided; doing the conversion here also turns an invalid workdir
        // into an ordinary error instead of a panic in the child.
        let dot = std::ffi::CString::new(".").expect("literal contains no NUL byte");
        let workdir = args.workdir.as_deref().unwrap_or("/");
        let workdir_c = std::ffi::CString::new(workdir)
            .map_err(|e| format!("invalid workdir {:?}: {}", workdir, e))?;

        cmd = cmd.with_pre_exec(move || {
            // Keep File objects alive so fds remain valid.
            let _keep_mnt = &mnt_ns_file;
            let _keep_root = &root_file;
            // SAFETY: both fds belong to `File`s owned by this closure and are
            // therefore still open; `dot` and `workdir_c` are NUL-terminated
            // strings owned by this closure that outlive the calls.
            unsafe {
                if libc::setns(mnt_ns_fd, libc::CLONE_NEWNS) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::fchdir(root_fd) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chroot(dot.as_ptr()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                // chdir to the requested workdir (or "/" if none).
                if libc::chdir(workdir_c.as_ptr()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
            }
            Ok(())
        });
    } else {
        // No mount namespace to join — access rootfs via procfs.
        let root_pid = find_root_pid(pid);
        cmd = cmd.with_chroot(format!("/proc/{}/root", root_pid));
        // For non-mount-ns exec, use the normal with_cwd mechanism.
        if let Some(ref w) = args.workdir {
            cmd = cmd.with_cwd(w);
        }
    }

    // Apply container environment as base
    for (k, v) in &container_env {
        cmd = cmd.env(k, v);
    }

    // Apply CLI -e overrides.  `KEY=VALUE` sets the variable directly; a bare
    // `KEY` forwards the value from this process's environment and is ignored
    // when the variable is not set here.
    for e in &args.env {
        if let Some((k, v)) = e.split_once('=') {
            cmd = cmd.env(k, v);
        } else if let Ok(v) = std::env::var(e) {
            cmd = cmd.env(e, v);
        }
    }

    // User
    if let Some(ref u) = args.user {
        let (uid, gid) = parse_user(u)?;
        cmd = cmd.with_uid(uid);
        if let Some(g) = gid {
            cmd = cmd.with_gid(g);
        }
    }

    // 5. Spawn
    if args.interactive {
        let session = cmd
            .spawn_interactive()
            .map_err(|e| format!("spawn_interactive failed: {}", e))?;
        match session.run() {
            Ok(status) => {
                // NOTE(review): a missing exit code maps to 0 here but to 1 in
                // the non-interactive path below — confirm the asymmetry is
                // intended.
                let code = status.code().unwrap_or(0);
                std::process::exit(code);
            }
            Err(e) => Err(format!("interactive session failed: {}", e).into()),
        }
    } else {
        cmd = cmd
            .stdin(Stdio::Inherit)
            .stdout(Stdio::Inherit)
            .stderr(Stdio::Inherit);

        let mut child = cmd
            .spawn()
            .map_err(|e| format!("exec spawn failed: {}", e))?;
        let exit = child
            .wait()
            .map_err(|e| format!("exec wait failed: {}", e))?;
        let code = exit.code().unwrap_or(1);
        std::process::exit(code);
    }
}

/// Run `args` in the container identified by `pid`'s namespaces.
///
/// `args[0]` is the executable and the remaining elements are its arguments.
///
/// Returns:
/// - `Some(true)` — command exited with status 0
/// - `Some(false)` — command exited non-zero
/// - `None` — `args` is empty, `pid` is not positive, the container is gone
///   (pid dead, namespaces unreachable), or the command could not be spawned
///   or waited on
///
/// Discards all output (stdin/stdout/stderr → /dev/null).
pub fn exec_in_container(pid: i32, args: &[String]) -> Option<bool> {
    if args.is_empty() || pid <= 0 {
        return None;
    }

    let ns_entries = discover_namespaces(pid).ok()?;
    // If we can't discover any namespaces the container is probably gone.
    // But allow proceeding (ns_entries may be empty if no namespaces differ).

    let mut cmd = Command::new(&args[0]).args(&args[1..]);
    cmd = cmd
        .stdin(Stdio::Null)
        .stdout(Stdio::Null)
        .stderr(Stdio::Null);

    // The mount namespace is joined by the pre_exec callback below (it has to
    // happen before chroot); every other namespace uses with_namespace_join().
    let mut has_mount_ns = false;
    for (path, ns) in &ns_entries {
        if *ns == Namespace::MOUNT {
            has_mount_ns = true;
        } else {
            cmd = cmd.with_namespace_join(path, *ns);
        }
    }

    if has_mount_ns {
        let mnt_ns_path = format!("/proc/{}/ns/mnt", pid);
        let mnt_ns_file = std::fs::File::open(&mnt_ns_path).ok()?;
        let mnt_ns_fd = mnt_ns_file.as_raw_fd();

        // See cmd_exec for the PID-namespace / intermediate-process explanation.
        let root_pid = find_root_pid(pid);
        let root_path = format!("/proc/{}/root", root_pid);
        let root_file = std::fs::File::open(&root_path).ok()?;
        let root_fd = root_file.as_raw_fd();

        // Build the C strings in the parent: the callback runs in the forked
        // child before exec, where heap allocation and panics must be avoided.
        let dot = std::ffi::CString::new(".").expect("literal contains no NUL byte");
        let root_c = std::ffi::CString::new("/").expect("literal contains no NUL byte");

        cmd = cmd.with_pre_exec(move || {
            // Keep File objects alive so fds remain valid.
            let _keep_mnt = &mnt_ns_file;
            let _keep_root = &root_file;
            // SAFETY: both fds belong to `File`s owned by this closure and are
            // therefore still open; `dot` and `root_c` are NUL-terminated
            // strings owned by this closure that outlive the calls.
            unsafe {
                if libc::setns(mnt_ns_fd, libc::CLONE_NEWNS) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::fchdir(root_fd) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chroot(dot.as_ptr()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chdir(root_c.as_ptr()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
            }
            Ok(())
        });
    } else {
        // No mount namespace to join — reach the rootfs through procfs.
        let root_pid = find_root_pid(pid);
        cmd = cmd.with_chroot(format!("/proc/{}/root", root_pid));
    }

    match cmd.spawn() {
        Ok(mut child) => child.wait().map(|s| s.success()).ok(),
        Err(_) => None,
    }
}

/// Like [`exec_in_container`] but stores the spawned child's host PID into
/// `child_pid_sink` (via `Relaxed` store) before blocking on `wait()`.
///
/// This lets a caller that enforces a timeout read the PID and send `SIGKILL`
/// to the child if the wait does not complete in time.  The sink is set to
/// `0` if spawn fails, so a value left over from an earlier use of the same
/// sink is never mistaken for a live child.  Failures that occur before a
/// spawn is attempted leave the sink untouched.
///
/// Returns `Some(true)` / `Some(false)` for a zero / non-zero exit status and
/// `None` when `args` is empty, `pid` is not positive, the namespace or root
/// handles cannot be opened, or spawning / waiting fails.
pub fn exec_in_container_with_pid_sink(
    pid: i32,
    args: &[String],
    child_pid_sink: Arc<AtomicI32>,
) -> Option<bool> {
    if args.is_empty() || pid <= 0 {
        return None;
    }

    let ns_entries = discover_namespaces(pid).ok()?;

    let mut cmd = Command::new(&args[0]).args(&args[1..]);
    cmd = cmd
        .stdin(Stdio::Null)
        .stdout(Stdio::Null)
        .stderr(Stdio::Null);

    // The mount namespace is joined by the pre_exec callback below (it has to
    // happen before chroot); every other namespace uses with_namespace_join().
    let mut has_mount_ns = false;
    for (path, ns) in &ns_entries {
        if *ns == Namespace::MOUNT {
            has_mount_ns = true;
        } else {
            cmd = cmd.with_namespace_join(path, *ns);
        }
    }

    if has_mount_ns {
        let mnt_ns_path = format!("/proc/{}/ns/mnt", pid);
        let mnt_ns_file = std::fs::File::open(&mnt_ns_path).ok()?;
        let mnt_ns_fd = mnt_ns_file.as_raw_fd();

        // `pid` may be an intermediate process whose root is still the host
        // root; find_root_pid() resolves the process that owns the container
        // root.
        let root_pid = find_root_pid(pid);
        let root_path = format!("/proc/{}/root", root_pid);
        let root_file = std::fs::File::open(&root_path).ok()?;
        let root_fd = root_file.as_raw_fd();

        // Build the C strings in the parent: the callback runs in the forked
        // child before exec, where heap allocation and panics must be avoided.
        let dot = std::ffi::CString::new(".").expect("literal contains no NUL byte");
        let root_c = std::ffi::CString::new("/").expect("literal contains no NUL byte");

        cmd = cmd.with_pre_exec(move || {
            // Keep File objects alive so fds remain valid.
            let _keep_mnt = &mnt_ns_file;
            let _keep_root = &root_file;
            // SAFETY: both fds belong to `File`s owned by this closure and are
            // therefore still open; `dot` and `root_c` are NUL-terminated
            // strings owned by this closure that outlive the calls.
            unsafe {
                if libc::setns(mnt_ns_fd, libc::CLONE_NEWNS) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::fchdir(root_fd) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chroot(dot.as_ptr()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
                if libc::chdir(root_c.as_ptr()) != 0 {
                    return Err(std::io::Error::last_os_error());
                }
            }
            Ok(())
        });
    } else {
        // No mount namespace to join — reach the rootfs through procfs.
        let root_pid = find_root_pid(pid);
        cmd = cmd.with_chroot(format!("/proc/{}/root", root_pid));
    }

    match cmd.spawn() {
        Ok(mut child) => {
            // Publish the PID before blocking so a watchdog can act on it.
            child_pid_sink.store(child.pid(), Ordering::Relaxed);
            child.wait().map(|s| s.success()).ok()
        }
        Err(_) => {
            // Honour the documented contract: no child exists, so the sink
            // must not keep advertising a PID.
            child_pid_sink.store(0, Ordering::Relaxed);
            None
        }
    }
}

/// Resolve the PID whose `/proc/{pid}/root` should be used as the container root.
///
/// When a PID namespace is in use, the recorded `state.pid` is the intermediate
/// process P.  P never calls pivot_root — its single child C (PID 1 inside the
/// container) does — so `/proc/P/root` is still the HOST root.  This function
/// reads `/proc/{pid}/task/{pid}/children` and, when exactly one child PID is
/// listed, returns that child so callers open `/proc/{child}/root` instead.
///
/// Without a PID namespace `state.pid` is the container process itself.  Any
/// children it has also live inside the container, so either PID's root is
/// correct.  In every other situation (children file unreadable, no children,
/// several children) `pid` is returned unchanged.
fn find_root_pid(pid: i32) -> i32 {
    let children_file = format!("/proc/{0}/task/{0}/children", pid);

    match std::fs::read_to_string(&children_file) {
        Ok(listing) => {
            // Entries that do not parse as a PID are ignored.
            let mut child_pids = listing
                .split_whitespace()
                .filter_map(|token| token.parse::<i32>().ok());

            // Exactly one child: a first element exists and a second does not.
            match (child_pids.next(), child_pids.next()) {
                (Some(only_child), None) => only_child,
                _ => pid,
            }
        }
        Err(_) => pid,
    }
}

/// List the namespaces of process `pid` that differ from those of init (PID 1).
///
/// For each namespace type the inode of `/proc/{pid}/ns/{type}` is compared
/// with the inode of `/proc/1/ns/{type}`; a mismatch means the process lives in
/// its own namespace of that type, and the `/proc/{pid}/ns/{type}` path is
/// returned together with the matching `Namespace` flag.  Types for which
/// either path cannot be stat'ed are skipped.
///
/// PID namespace special case: `pid` may be the intermediate process P, which
/// itself lives in the host PID namespace, so `/proc/P/ns/pid` equals init's
/// and the comparison above finds nothing.  The namespace P's children (the
/// container's PID 1) live in is exposed as `/proc/P/ns/pid_for_children`.
/// When no PID entry was found, that link is compared with init's PID
/// namespace and, if different, returned as the `Namespace::PID` entry.  A
/// `setns` on it with `CLONE_NEWPID` in pre_exec followed by `exec()` places
/// the exec'd process in the container's PID namespace.
pub fn discover_namespaces(
    pid: i32,
) -> Result<Vec<(PathBuf, Namespace)>, Box<dyn std::error::Error>> {
    use std::os::unix::fs::MetadataExt;

    // Inode behind a namespace link, or `None` when it cannot be stat'ed.
    let inode_of = |link: &str| std::fs::metadata(link).ok().map(|meta| meta.ino());

    let kinds = [
        ("mnt", Namespace::MOUNT),
        ("uts", Namespace::UTS),
        ("ipc", Namespace::IPC),
        ("net", Namespace::NET),
        ("pid", Namespace::PID),
        ("user", Namespace::USER),
        ("cgroup", Namespace::CGROUP),
    ];

    let mut found: Vec<(PathBuf, Namespace)> = kinds
        .iter()
        .filter_map(|&(kind, flag)| {
            let target_link = format!("/proc/{}/ns/{}", pid, kind);
            let target_ino = inode_of(&target_link)?;
            let init_ino = inode_of(&format!("/proc/1/ns/{}", kind))?;
            if target_ino != init_ino {
                Some((PathBuf::from(target_link), flag))
            } else {
                None
            }
        })
        .collect();

    // Fallback for the intermediate process P: it sits in the host PID
    // namespace, but its children are created in the namespace that
    // pid_for_children refers to.
    let has_pid_entry = found.iter().any(|(_, flag)| *flag == Namespace::PID);
    if !has_pid_entry {
        let children_link = format!("/proc/{}/ns/pid_for_children", pid);
        let children_ino = inode_of(&children_link);
        let init_pid_ino = inode_of("/proc/1/ns/pid");
        if let (Some(children), Some(init)) = (children_ino, init_pid_ino) {
            if children != init {
                found.push((PathBuf::from(children_link), Namespace::PID));
            }
        }
    }

    Ok(found)
}

/// Parse `/proc/{pid}/environ` into `(key, value)` pairs.
///
/// The file is a sequence of NUL-separated `KEY=VALUE` records.  Empty records
/// and records without an `=` are dropped; each record is split at its first
/// `=`.  Bytes that are not valid UTF-8 are replaced lossily.  An unreadable
/// file yields an empty list.
fn read_proc_environ(pid: i32) -> Vec<(String, String)> {
    let raw = std::fs::read(format!("/proc/{}/environ", pid)).unwrap_or_default();

    let mut vars = Vec::new();
    for record in raw.split(|byte| *byte == 0) {
        if record.is_empty() {
            continue;
        }
        let text = String::from_utf8_lossy(record);
        if let Some((key, value)) = text.split_once('=') {
            vars.push((key.to_owned(), value.to_owned()));
        }
    }
    vars
}