pathrs 0.2.4

C-friendly API to make path resolution safer on Linux.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
// SPDX-License-Identifier: MPL-2.0 OR LGPL-3.0-or-later
/*
 * libpathrs: safe path resolution on Linux
 * Copyright (C) 2019-2025 SUSE LLC
 * Copyright (C) 2026 Aleksa Sarai <cyphar@cyphar.com>
 *
 * == MPL-2.0 ==
 *
 *  This Source Code Form is subject to the terms of the Mozilla Public
 *  License, v. 2.0. If a copy of the MPL was not distributed with this
 *  file, You can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * Alternatively, this Source Code Form may also (at your option) be used
 * under the terms of the GNU Lesser General Public License Version 3, as
 * described below:
 *
 * == LGPL-3.0-or-later ==
 *
 *  This program is free software: you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or (at
 *  your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 * Public License  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

//! libpathrs::opath implements an emulated version of openat2(RESOLVE_IN_ROOT).
//! The primary method by which this is done is through shameless abuse of
//! procfs and O_PATH magic-links. The basic idea is that we need to perform all
//! of the path resolution steps (walking down the set of components, handling
//! the effect of symlinks on the resolution, etc).
//!
//! In order to do this safely we need to verify after the walk is done whether
//! the path of the final file descriptor is what we expected (most importantly,
//! is it inside the root which we started the walk with?). This check is done
//! through readlink(/proc/self/fd/$n), which is a magic kernel interface which
//! gives you the kernel's view of the path -- and in cases where the kernel is
//! unsure or otherwise unhappy you get "/".
//!
//! If the check fails, we assume we are being attacked and return an error (and
//! the caller can decide to re-try if they want). The kernel implementation
//! will fail in fewer cases because it has access to in-kernel locks and other
//! measures, but the final check through procfs should block all attack
//! attempts.

use crate::{
    error::{Error, ErrorExt, ErrorImpl},
    flags::{OpenFlags, ResolverFlags},
    procfs::ProcfsHandle,
    resolvers::{opath::SymlinkStack, PartialLookup, MAX_SYMLINK_TRAVERSALS},
    syscalls,
    utils::{self, FdExt, PathIterExt},
    Handle,
};

use std::{
    collections::VecDeque,
    ffi::{OsStr, OsString},
    io::Error as IOError,
    iter,
    os::unix::{
        ffi::OsStrExt,
        fs::MetadataExt,
        io::{AsFd, OwnedFd},
    },
    path::{Path, PathBuf},
    rc::Rc,
};

use itertools::Itertools;
use once_cell::sync::Lazy;

/// Verify that `current` is located at `expected` underneath `root`.
///
/// The check is string-based: the path reported for `root` is joined with
/// `expected` and compared against the path reported for `current`. The root
/// path is then fetched a second time to detect the root being moved while the
/// check was running.
///
/// # Errors
///
/// Returns [`ErrorImpl::SafetyViolation`] if the two paths differ or if the
/// root path changed between the two look-ups. Failures to fetch any of the
/// paths are propagated with added context.
fn check_current(
    procfs: &ProcfsHandle,
    current: impl AsFd,
    root: impl AsFd,
    expected: impl AsRef<Path>,
) -> Result<(), Error> {
    // SAFETY: using as_unsafe_path here is fine because the result only feeds
    //         a string comparison that is part of a larger safety setup, and
    //         the root path is fetched again (and compared) further down.
    let root_before = root
        .as_unsafe_path(procfs)
        .wrap("get root path to construct expected path")?;

    // Path::join() discards the prefix if the joined path starts with "/". We
    // don't expect that to happen in practice (RawComponents has no equivalent
    // of Components::RootDir), but starting the relative part with "." rules
    // it out entirely.
    // NOTE: PathBuf::push() does not normalise components.
    let mut relative = PathBuf::from(".");
    relative.extend(expected.as_ref().raw_components());
    let wanted = root_before.join(relative);

    // Ask procfs where the kernel thinks the fd is. This can produce false
    // positives (for instance when the kernel decides the path is not
    // ordinarily resolvable), but when it does match we can be fairly sure
    // (barring kernel bugs) that the path was safe at some point in time.
    // SAFETY: as_unsafe_path is fine here since the whole point is to do a
    //         string-based comparison against the path we want.
    let actual = current
        .as_unsafe_path(procfs)
        .wrap("check fd against expected path")?;

    // Any difference at all is treated as an attack.
    if actual != wanted {
        return Err(ErrorImpl::SafetyViolation {
            description: format!(
                "fd doesn't match expected path ({} != {})",
                actual.display(),
                wanted.display()
            )
            .into(),
        }
        .into());
    }

    // Finally, make sure the root is still where it was. An attacker could
    // (in theory) get around this check, so users must be aware that letting
    // an attacker move the root is a very bad idea.
    // SAFETY: as_unsafe_path is fine here because the result is only used in
    //         a string comparison which is already known to be imperfect.
    let root_after = root
        .as_unsafe_path(procfs)
        .wrap("get root path to double-check it hasn't moved")?;
    if root_before != root_after {
        return Err(ErrorImpl::SafetyViolation {
            description: "root moved during lookup".into(),
        }
        .into());
    }

    Ok(())
}

/// Value of the `fs.protected_symlinks` sysctl, read once on first use and
/// cached for the rest of the program's lifetime.
///
/// Initialisation panics if a procfs handle cannot be obtained or if the
/// sysctl cannot be read and parsed.
// TODO: The sysctl could in principle be changed while the program is running,
// but there is no nice way to detect that and re-reading it for every symlink
// lookup is more likely to be a problem than a stale value.
// MSRV(1.80): Use LazyLock.
static PROTECTED_SYMLINKS_SYSCTL: Lazy<u32> = Lazy::new(|| {
    let handle = ProcfsHandle::new().expect("should be able to get a procfs handle");
    let value: Result<u32, _> = utils::sysctl_read_parse(&handle, "fs.protected_symlinks");
    value.expect("should be able to parse fs.protected_symlinks")
});

/// Decide whether the symlink `link` (found inside `dir`) may be followed.
///
/// Symlinks are followed in userspace by this resolver, which means the kernel
/// never gets the chance to apply its own restrictions. This function emulates
/// two of them:
///
///  * `MS_NOSYMFOLLOW` mounts -- refused with `ELOOP`.
///  * `fs.protected_symlinks` -- refused with `EACCES`.
///
/// # Errors
///
/// Besides the two emulated errors above, failures to `fstatfs` the link or to
/// fetch the metadata of `dir` or `link` are returned to the caller.
fn may_follow_link(dir: impl AsFd, link: impl AsFd) -> Result<(), Error> {
    let link = link.as_fd();

    // rustix does not expose this flag: rustix::fs::StatVfs has a proper
    // bitflags type, but StatVfsMountFlags lacks ST_NOSYMFOLLOW because it is
    // Linux-specific.
    //
    // NOTE: This is a plain `let` rather than a `const` because the type of
    // f_flags depends on both the architecture and the rustix backend -- it is
    // simpler to let Rust infer the right integer type. Something like
    // "const A: typeof<B> = foo" would be really nice here.
    #[allow(non_snake_case)]
    let ST_NOSYMFOLLOW = 0x2000; // From <linux/statfs.h>.

    // Symlinks on an MS_NOSYMFOLLOW mount must not be resolved, to match what
    // openat2 would do.
    let mount_info = syscalls::fstatfs(link).map_err(|err| ErrorImpl::RawOsError {
        operation: "fetch mount flags of symlink".into(),
        source: err,
    })?;
    if mount_info.f_flags & ST_NOSYMFOLLOW != 0 {
        return Err(ErrorImpl::OsError {
            operation: "emulated MS_NOSYMFOLLOW".into(),
            source: IOError::from_raw_os_error(libc::ELOOP),
        }
        .into());
    }

    // Gather everything the fs.protected_symlinks check needs.
    let fsuid = syscalls::geteuid();
    let dir_meta = dir.metadata().wrap("fetch directory metadata")?;
    let link_meta = link.metadata().wrap("fetch symlink metadata")?;

    const STICKY_WRITABLE: libc::mode_t = libc::S_ISVTX | libc::S_IWOTH;

    let link_owner = link_meta.uid();
    let permitted =
        // The restriction only applies if fs.protected_symlinks is enabled.
        *PROTECTED_SYMLINKS_SYSCTL == 0
        // Following is fine if the follower owns the link ...
        || link_owner == fsuid
        // ... or the directory is not both sticky and world-writable ...
        || dir_meta.mode() & STICKY_WRITABLE != STICKY_WRITABLE
        // ... or the link and its parent directory have the same owner.
        || link_owner == dir_meta.uid();

    if !permitted {
        return Err(ErrorImpl::OsError {
            operation: "emulated fs.protected_symlinks".into(),
            source: IOError::from_raw_os_error(libc::EACCES),
        }
        .into());
    }

    Ok(())
}

/// Common implementation used by `resolve_partial()` and `resolve()`.
///
/// Walks `path` one component at a time starting from `root`, expanding any
/// symlinks encountered in userspace. The main difference between the two
/// callers is `symlink_stack`: when it is `Some`, the stack is kept up to date
/// as components and symlinks are walked, so that `resolve_partial()` can
/// report a failed lookup from the context of the top-most symlink.
///
/// If `no_follow_trailing` is set, a symlink found as the final component is
/// returned as-is rather than being followed.
///
/// Failures to open a component, symlinks that may not be followed and
/// exceeding the symlink limit are reported as `Ok(PartialLookup::Partial)`
/// carrying the last good handle, the remaining path and the error. Safety
/// violations and other unexpected failures are returned as `Err`.
// TODO: Make (flags, no_follow_trailing, symlink_stack) a single struct to
//       avoid possible issues with passing a bool to the wrong argument.
fn do_resolve(
    root: impl AsFd,
    path: impl AsRef<Path>,
    flags: ResolverFlags,
    no_follow_trailing: bool,
    mut symlink_stack: Option<&mut SymlinkStack<OwnedFd>>,
) -> Result<PartialLookup<Rc<OwnedFd>>, Error> {
    // We always need procfs for validation when using this resolver.
    let procfs = ProcfsHandle::new()?;

    // What is the final path we expect to get after we do the final open? This
    // allows us to track any attacker moving path components around and we can
    // sanity-check at the very end. This does not include rootpath.
    let mut expected_path = PathBuf::from("/");

    // We only need to keep track of our current dirfd, since we are applying
    // the components one-by-one, and can always switch back to the root
    // if we hit an absolute symlink.
    let root = Rc::new(
        root.as_fd()
            .try_clone_to_owned()
            .map_err(|err| ErrorImpl::OsError {
                operation: "dup root handle as starting point of resolution".into(),
                source: err,
            })?,
    );
    let mut current = Rc::clone(&root);

    // Get initial set of components from the passed path. We remove components
    // as we do the path walk, and update them with the contents of any symlinks
    // we encounter. Path walking terminates when there are no components left.
    let mut remaining_components = path
        .raw_components()
        .map(|p| p.to_os_string())
        .collect::<VecDeque<_>>();

    let mut symlink_traversals = 0;
    while let Some(part) = remaining_components.pop_front() {
        // Stash a copy of the real remaining path (including the component we
        // are about to walk). We can't just use ::collect<PathBuf> because we
        // might have "" components, which std::path::PathBuf doesn't like.
        let remaining: PathBuf = Itertools::intersperse(
            iter::once(&part)
                .chain(remaining_components.iter())
                .map(OsString::as_os_str),
            OsStr::new("/"),
        )
        .collect::<OsString>()
        .into();

        let part = match part.as_bytes() {
            // If we hit an empty component, we need to treat it as though it is
            // "." so that trailing "/" and "//" components on a non-directory
            // correctly return the right error code.
            b"" => ".".into(),
            // For "." component we don't touch expected_path, but we do try to
            // do the open (to return the correct openat2-compliant error if the
            // current path is not a directory).
            b"." => part,
            b".." => {
                // All of expected_path is non-symlinks, so we can treat ".."
                // lexically. If pop() fails, then we are at the root and ".."
                // just leaves us at the root.
                if !expected_path.pop() {
                    // If we hit ".." due to the symlink we need to drop it from
                    // the stack like we would if we walked into a real
                    // component. Otherwise walking into ".." will result in a
                    // broken symlink stack error.
                    if let Some(ref mut stack) = symlink_stack {
                        stack
                            .pop_part(&part)
                            .map_err(|err| ErrorImpl::BadSymlinkStackError {
                                description: "walking into component".into(),
                                source: err,
                            })?;
                    }
                    current = Rc::clone(&root);
                    continue;
                }
                part
            }
            _ => {
                // This part might be a symlink, but we clean that up later.
                expected_path.push(&part);

                // Ensure that part doesn't contain any "/"s. It's critical we
                // are only touching the final component in the path. If there
                // are any other path components we must bail. This shouldn't
                // ever happen, but it's better to be safe.
                if part.as_bytes().contains(&b'/') {
                    Err(ErrorImpl::SafetyViolation {
                        description: "component of path resolution contains '/'".into(),
                    })?
                }

                part
            }
        };

        // Get our next element. O_NOFOLLOW is used so that symlinks are opened
        // as symlinks and can be handled by us below.
        // MSRV(1.69): Remove &*.
        match syscalls::openat(
            &*current,
            &part,
            OpenFlags::O_PATH | OpenFlags::O_NOFOLLOW,
            0,
        )
        .map_err(|err| {
            ErrorImpl::RawOsError {
                operation: "open next component of resolution".into(),
                source: err,
            }
            .into()
        }) {
            // The component could not be opened, so report how far we got.
            Err(err) => {
                return Ok(PartialLookup::Partial {
                    handle: current,
                    remaining,
                    last_error: err,
                });
            }
            Ok(next) => {
                // Make sure that the path is what we expect. If not, there was
                // a racing rename and we should bail out here -- otherwise we
                // might be tricked into revealing information outside the
                // rootfs through error or timing-related attacks.
                //
                // The safety argument for only needing to check ".." is
                // identical to the kernel implementation (namely, walking down
                // is safe by-definition). However, unlike the in-kernel version
                // we don't have the luxury of only doing this check when there
                // was a racing rename -- we have to do it every time.
                if part.as_bytes() == b".." {
                    // MSRV(1.69): Remove &*.
                    check_current(&procfs, &next, &*root, &expected_path)
                        .wrap("check next '..' component didn't escape")?;
                }

                // Is the next dirfd a symlink or an ordinary path? If we're an
                // ordinary dirent, we just update current and move on to the
                // next component. Nothing special here.
                if !next
                    .metadata()
                    .wrap("fstat of next component")?
                    .is_symlink()
                {
                    // We hit a non-symlink component, so clear it from the
                    // symlink stack.
                    if let Some(ref mut stack) = symlink_stack {
                        stack
                            .pop_part(&part)
                            .map_err(|err| ErrorImpl::BadSymlinkStackError {
                                description: "walking into component".into(),
                                source: err,
                            })?;
                    }
                    // Just keep walking.
                    current = next.into();
                    continue;
                } else {
                    // If we hit the last component and we were told to not follow
                    // the trailing symlink, just return the link we have.
                    if remaining_components.is_empty() && no_follow_trailing {
                        current = next.into();
                        break;
                    }

                    // Don't continue walking if user asked for no symlinks.
                    if flags.contains(ResolverFlags::NO_SYMLINKS) {
                        return Ok(PartialLookup::Partial {
                            handle: current,
                            remaining,
                            // Construct a fake OS error containing ELOOP.
                            last_error: ErrorImpl::OsError {
                                operation: "emulated symlink resolution".into(),
                                source: IOError::from_raw_os_error(libc::ELOOP),
                            }
                            .wrap(format!(
                                "component {part:?} is a symlink but symlink resolution is disabled",
                            ))
                            .into(),
                        });
                    }

                    // Verify that we can follow the link.
                    // MSRV(1.69): Remove &*.
                    if let Err(err) = may_follow_link(&*current, &next) {
                        return Ok(PartialLookup::Partial {
                            handle: current,
                            remaining,
                            last_error: err
                                .wrap(format!("component {part:?} is a symlink we cannot follow")),
                        });
                    }

                    // We need a limit on the number of symlinks we traverse to
                    // avoid hitting filesystem loops and DoSing.
                    symlink_traversals += 1;
                    if symlink_traversals >= MAX_SYMLINK_TRAVERSALS {
                        return Ok(PartialLookup::Partial {
                            handle: current,
                            remaining,
                            // Construct a fake OS error containing ELOOP.
                            last_error: ErrorImpl::OsError {
                                operation: "emulated symlink resolution".into(),
                                source: IOError::from_raw_os_error(libc::ELOOP),
                            }
                            .wrap("exceeded symlink limit")
                            .into(),
                        });
                    }

                    let link_target =
                        syscalls::readlinkat(&next, "").map_err(|err| ErrorImpl::RawOsError {
                            operation: "readlink next symlink component".into(),
                            source: err,
                        })?;

                    // Check if it's a good idea to walk this symlink. If we are on
                    // a filesystem that supports magic-links and we've hit an
                    // absolute symlink, it is incredibly likely that this component
                    // is a magic-link and it makes no sense to try to resolve it in
                    // userspace.
                    //
                    // NOTE: There are some pseudo-magic-links like /proc/self
                    // (which dynamically generates the symlink contents but doesn't
                    // use nd_jump_link). In the case of procfs, these are always
                    // relative, and they are reasonable for us to walk.
                    //
                    // In procfs, all magic-links use d_path() to generate
                    // readlink() and thus are all absolute paths. (Unfortunately,
                    // apparmorfs uses nd_jump_link to make
                    // /sys/kernel/security/apparmor/policy dynamic using actual
                    // nd_jump_link() and their readlink give us a dummy relative
                    // path like "apparmorfs:[123]". But in that case we will just
                    // get an error.)
                    if link_target.is_absolute()
                        && next
                            .is_magiclink_filesystem()
                            .wrap("check if next is on a dangerous filesystem")?
                    {
                        Err(ErrorImpl::OsError {
                            operation: "emulated RESOLVE_NO_MAGICLINKS".into(),
                            source: IOError::from_raw_os_error(libc::ELOOP),
                        })
                        .wrap("walked into a potential magic-link")?
                    }

                    // Swap out the symlink component in the symlink stack with
                    // a new entry for the link target.
                    if let Some(ref mut stack) = symlink_stack {
                        stack
                            .swap_link(&part, (&current, remaining), link_target.clone())
                            .map_err(|err| ErrorImpl::BadSymlinkStackError {
                                description: "walking into symlink".into(),
                                source: err,
                            })?;
                    }

                    // Remove the link component from our expected path.
                    expected_path.pop();

                    // Add contents of the symlink to the set of components we are
                    // looping over.
                    link_target
                        .raw_components()
                        .prepend(&mut remaining_components);

                    // Absolute symlinks reset our current state back to /.
                    if link_target.is_absolute() {
                        current = Rc::clone(&root);
                        expected_path = PathBuf::from("/");
                    }
                }
            }
        }
    }

    // Make sure that the path is what we expect...
    // MSRV(1.69): Remove &*.
    check_current(&procfs, &*current, &*root, &expected_path)
        .wrap("check final handle didn't escape")?;

    // We finished the lookup with no remaining components.
    Ok(PartialLookup::Complete(current))
}

/// Resolve as many components of `path` inside `root` as possible, using the
/// user-space emulated resolver.
///
/// A lookup that stops part-way is returned as `PartialLookup::Partial`. If
/// the failure happened while a symlink was being resolved, the returned
/// handle and remaining path are those of the top-most symlink in the
/// resolution (matching openat2(2)); the reported error is unchanged.
pub(crate) fn resolve_partial(
    root: impl AsFd,
    path: impl AsRef<Path>,
    flags: ResolverFlags,
    no_follow_trailing: bool,
) -> Result<PartialLookup<Rc<OwnedFd>>, Error> {
    // Partial lookups need a SymlinkStack in order to match openat2.
    let mut symlink_stack = SymlinkStack::new();

    // Hard errors are passed straight through to the caller.
    let lookup = do_resolve(
        root,
        path,
        flags,
        no_follow_trailing,
        Some(&mut symlink_stack),
    )?;

    match lookup {
        PartialLookup::Partial {
            handle,
            remaining,
            last_error,
        } => {
            // If we stopped in the middle of symlink resolution, report the
            // failure from the context of the top symlink. With an empty
            // stack we keep what do_resolve() gave us.
            let (handle, remaining) = symlink_stack
                .pop_top_symlink()
                .unwrap_or((handle, remaining));
            Ok(PartialLookup::Partial {
                handle,
                remaining,
                last_error,
            })
        }
        // Complete lookups need no adjustment.
        complete => Ok(complete),
    }
}

/// Fully resolve `path` inside `root` using the user-space emulated resolver.
///
/// No symlink stack is tracked for this kind of lookup. The lookup result is
/// converted into a [`Handle`] through its `TryInto` implementation, and any
/// error from either step is returned to the caller.
pub(crate) fn resolve(
    root: impl AsFd,
    path: impl AsRef<Path>,
    flags: ResolverFlags,
    no_follow_trailing: bool,
) -> Result<Handle, Error> {
    let lookup = do_resolve(root, path, flags, no_follow_trailing, None)?;
    lookup.try_into()
}