Skip to main content

secure_exec_kernel/
fd_table.rs

1use std::collections::{btree_map::Values, BTreeMap, BTreeSet};
2use std::error::Error;
3use std::fmt;
4use std::sync::atomic::{AtomicU32, AtomicU64, AtomicUsize, Ordering};
5use std::sync::{Arc, Condvar, Mutex, MutexGuard};
6
/// Default upper bound on the number of descriptors a single process table may hold.
pub const MAX_FDS_PER_PROCESS: usize = 256;

// Access modes; together they occupy the low two bits of the open flags.
pub const O_RDONLY: u32 = 0x0;
pub const O_WRONLY: u32 = 0x1;
pub const O_RDWR: u32 = 0x2;

// Open/status flag bits (octal equivalents noted for comparison with C headers).
pub const O_CREAT: u32 = 0x40; // 0o100
pub const O_EXCL: u32 = 0x80; // 0o200
pub const O_TRUNC: u32 = 0x200; // 0o1000
pub const O_APPEND: u32 = 0x400; // 0o2000
pub const O_NONBLOCK: u32 = 0x800; // 0o4000

// Command numbers understood by `ProcessFdTable::fcntl`.
pub const F_DUPFD: u32 = 0;
pub const F_GETFD: u32 = 1;
pub const F_SETFD: u32 = 2;
pub const F_GETFL: u32 = 3;
pub const F_SETFL: u32 = 4;

/// The only per-descriptor flag bit kept by `F_SETFD` / reported by `F_GETFD`.
pub const FD_CLOEXEC: u32 = 1;

// Operation bits decoded by `FlockOperation::from_bits`.
pub const LOCK_SH: u32 = 1 << 0;
pub const LOCK_EX: u32 = 1 << 1;
pub const LOCK_NB: u32 = 1 << 2;
pub const LOCK_UN: u32 = 1 << 3;

// File type tags stored in `FdEntry::filetype`.
pub const FILETYPE_UNKNOWN: u8 = 0;
pub const FILETYPE_CHARACTER_DEVICE: u8 = 2;
pub const FILETYPE_DIRECTORY: u8 = 3;
pub const FILETYPE_REGULAR_FILE: u8 = 4;
pub const FILETYPE_PIPE: u8 = 6;
pub const FILETYPE_SYMBOLIC_LINK: u8 = 7;
34
/// Result alias for the fallible operations in this file; the error type is [`FdTableError`].
pub type FdResult<T> = Result<T, FdTableError>;
/// Reference-counted handle to a [`FileDescription`] that descriptor entries share.
pub type SharedFileDescription = Arc<FileDescription>;
37
/// Error reported by descriptor-table and file-lock operations.
///
/// Carries an errno-style symbolic code together with a human-readable message;
/// `Display` renders them as `"<code>: <message>"`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FdTableError {
    code: &'static str,
    message: String,
}

impl FdTableError {
    /// Returns the symbolic error code, for example `"EBADF"`.
    pub fn code(&self) -> &'static str {
        self.code
    }

    /// Common constructor behind the code-specific helpers below.
    fn with_code(code: &'static str, message: impl Into<String>) -> Self {
        let message = message.into();
        Self { code, message }
    }

    /// `EBADF`: `fd` is not open in the table or is outside its bounds.
    fn bad_file_descriptor(fd: u32) -> Self {
        Self::with_code("EBADF", format!("bad file descriptor {fd}"))
    }

    /// `EMFILE`: no descriptor number is available.
    fn too_many_open_files() -> Self {
        Self::with_code("EMFILE", "too many open files")
    }

    /// `EINVAL` with a caller-supplied explanation.
    fn invalid_argument(message: impl Into<String>) -> Self {
        Self::with_code("EINVAL", message)
    }

    /// `EWOULDBLOCK` with a caller-supplied explanation.
    fn would_block(message: impl Into<String>) -> Self {
        Self::with_code("EWOULDBLOCK", message)
    }
}

impl fmt::Display for FdTableError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { code, message } = self;
        write!(f, "{code}: {message}")
    }
}

impl Error for FdTableError {}
85
86#[derive(Debug)]
87pub struct FileDescription {
88    id: u64,
89    path: String,
90    lock_target: Option<FileLockTarget>,
91    cursor: AtomicU64,
92    flags: AtomicU32,
93    ref_count: AtomicUsize,
94}
95
96impl FileDescription {
97    pub fn new(id: u64, path: impl Into<String>, flags: u32) -> Self {
98        Self::with_ref_count_and_lock(id, path, flags, 1, None)
99    }
100
101    pub fn new_with_lock(
102        id: u64,
103        path: impl Into<String>,
104        flags: u32,
105        lock_target: Option<FileLockTarget>,
106    ) -> Self {
107        Self::with_ref_count_and_lock(id, path, flags, 1, lock_target)
108    }
109
110    pub fn with_ref_count(id: u64, path: impl Into<String>, flags: u32, ref_count: usize) -> Self {
111        Self::with_ref_count_and_lock(id, path, flags, ref_count, None)
112    }
113
114    pub fn with_ref_count_and_lock(
115        id: u64,
116        path: impl Into<String>,
117        flags: u32,
118        ref_count: usize,
119        lock_target: Option<FileLockTarget>,
120    ) -> Self {
121        Self {
122            id,
123            path: path.into(),
124            lock_target,
125            cursor: AtomicU64::new(0),
126            flags: AtomicU32::new(flags),
127            ref_count: AtomicUsize::new(ref_count),
128        }
129    }
130
131    pub fn id(&self) -> u64 {
132        self.id
133    }
134
135    pub fn path(&self) -> &str {
136        &self.path
137    }
138
139    pub fn lock_target(&self) -> Option<FileLockTarget> {
140        self.lock_target
141    }
142
143    pub fn cursor(&self) -> u64 {
144        self.cursor.load(Ordering::SeqCst)
145    }
146
147    pub fn set_cursor(&self, cursor: u64) {
148        self.cursor.store(cursor, Ordering::SeqCst);
149    }
150
151    pub fn flags(&self) -> u32 {
152        self.flags.load(Ordering::SeqCst)
153    }
154
155    pub fn update_flags(&self, mask: u32, flags: u32) -> u32 {
156        let mut current = self.flags();
157        loop {
158            let next = (current & !mask) | (flags & mask);
159            match self
160                .flags
161                .compare_exchange(current, next, Ordering::SeqCst, Ordering::SeqCst)
162            {
163                Ok(_) => return next,
164                Err(observed) => current = observed,
165            }
166        }
167    }
168
169    pub fn ref_count(&self) -> usize {
170        self.ref_count.load(Ordering::SeqCst)
171    }
172
173    pub fn increment_ref_count(&self) -> usize {
174        self.ref_count.fetch_add(1, Ordering::SeqCst) + 1
175    }
176
177    pub fn decrement_ref_count(&self) -> usize {
178        let mut current = self.ref_count.load(Ordering::SeqCst);
179        loop {
180            let next = current.saturating_sub(1);
181            match self
182                .ref_count
183                .compare_exchange(current, next, Ordering::SeqCst, Ordering::SeqCst)
184            {
185                Ok(_) => return next,
186                Err(observed) => current = observed,
187            }
188        }
189    }
190}
191
/// One slot of a [`ProcessFdTable`]: a descriptor number bound to a shared description.
#[derive(Debug, Clone)]
pub struct FdEntry {
    /// Descriptor number; the table stores the entry under this same key.
    pub fd: u32,
    /// Shared description this descriptor refers to.
    pub description: SharedFileDescription,
    /// Status flags tracked per entry rather than on the shared description.
    pub status_flags: u32,
    /// Per-descriptor flags; `fcntl(F_SETFD)` stores only the `FD_CLOEXEC` bit here.
    pub fd_flags: u32,
    /// Rights value; the table initializes it to 0 and copies it on dup and fork.
    pub rights: u64,
    /// File type tag, e.g. one of the `FILETYPE_*` constants.
    pub filetype: u8,
}
201
/// Snapshot of a descriptor as returned by [`ProcessFdTable::stat`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FdStat {
    /// File type tag copied from the entry.
    pub filetype: u8,
    /// Visible flags: selected description bits combined with the entry's status bits.
    pub flags: u32,
    /// Rights value copied from the entry.
    pub rights: u64,
}
208
/// Replacement for one default standard stream, consumed by
/// [`FdTableManager::create_with_stdio`].
#[derive(Debug, Clone)]
pub struct StdioOverride {
    /// Existing description to install for the stream.
    pub description: SharedFileDescription,
    /// File type tag to record on the installed entry.
    pub filetype: u8,
}
214
215#[derive(Debug, Clone)]
216struct DescriptionFactory {
217    next_description_id: Arc<AtomicU64>,
218}
219
220impl DescriptionFactory {
221    fn new(starting_id: u64) -> Self {
222        Self {
223            next_description_id: Arc::new(AtomicU64::new(starting_id)),
224        }
225    }
226
227    fn allocate(&self, path: &str, flags: u32) -> SharedFileDescription {
228        self.allocate_with_lock(path, flags, None)
229    }
230
231    fn allocate_with_lock(
232        &self,
233        path: &str,
234        flags: u32,
235        lock_target: Option<FileLockTarget>,
236    ) -> SharedFileDescription {
237        let next_id = self.next_description_id.fetch_add(1, Ordering::SeqCst);
238        Arc::new(FileDescription::new_with_lock(
239            next_id,
240            path,
241            flags,
242            lock_target,
243        ))
244    }
245}
246
/// Key identifying what an advisory lock applies to; wraps a single `ino` number.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct FileLockTarget {
    ino: u64,
}

impl FileLockTarget {
    /// Wraps `ino` as a lock target.
    pub const fn new(ino: u64) -> Self {
        FileLockTarget { ino }
    }

    /// Returns the wrapped `ino` value.
    pub const fn ino(self) -> u64 {
        let FileLockTarget { ino } = self;
        ino
    }
}
261
/// Strength of an advisory lock request handled by [`FileLockManager`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum FileLockMode {
    /// May be held by several owners at once.
    Shared,
    /// Held by at most one owner, and only when no other owner holds a shared lock.
    Exclusive,
}
267
268#[derive(Debug, Clone, Copy, PartialEq, Eq)]
269pub enum FlockOperation {
270    Shared { nonblocking: bool },
271    Exclusive { nonblocking: bool },
272    Unlock,
273}
274
275impl FlockOperation {
276    pub fn from_bits(operation: u32) -> FdResult<Self> {
277        let nonblocking = operation & LOCK_NB != 0;
278        match operation & !LOCK_NB {
279            LOCK_SH => Ok(Self::Shared { nonblocking }),
280            LOCK_EX => Ok(Self::Exclusive { nonblocking }),
281            LOCK_UN => Ok(Self::Unlock),
282            _ => Err(FdTableError::invalid_argument(format!(
283                "invalid flock operation {operation:#x}"
284            ))),
285        }
286    }
287}
288
289#[derive(Debug, Clone)]
290pub struct ProcessFdTable {
291    entries: BTreeMap<u32, FdEntry>,
292    next_fd: u32,
293    alloc_desc: DescriptionFactory,
294    max_fds: usize,
295}
296
297impl ProcessFdTable {
298    fn new(alloc_desc: DescriptionFactory, max_fds: usize) -> Self {
299        Self {
300            entries: BTreeMap::new(),
301            next_fd: 3,
302            alloc_desc,
303            max_fds,
304        }
305    }
306
307    pub fn max_fds(&self) -> usize {
308        self.max_fds
309    }
310
311    pub fn init_stdio(
312        &mut self,
313        stdin_desc: SharedFileDescription,
314        stdout_desc: SharedFileDescription,
315        stderr_desc: SharedFileDescription,
316    ) {
317        self.entries.insert(
318            0,
319            FdEntry {
320                fd: 0,
321                description: stdin_desc,
322                status_flags: 0,
323                fd_flags: 0,
324                rights: 0,
325                filetype: FILETYPE_CHARACTER_DEVICE,
326            },
327        );
328        self.entries.insert(
329            1,
330            FdEntry {
331                fd: 1,
332                description: stdout_desc,
333                status_flags: 0,
334                fd_flags: 0,
335                rights: 0,
336                filetype: FILETYPE_CHARACTER_DEVICE,
337            },
338        );
339        self.entries.insert(
340            2,
341            FdEntry {
342                fd: 2,
343                description: stderr_desc,
344                status_flags: 0,
345                fd_flags: 0,
346                rights: 0,
347                filetype: FILETYPE_CHARACTER_DEVICE,
348            },
349        );
350    }
351
352    pub fn init_stdio_with_types(
353        &mut self,
354        stdin_desc: SharedFileDescription,
355        stdin_type: u8,
356        stdout_desc: SharedFileDescription,
357        stdout_type: u8,
358        stderr_desc: SharedFileDescription,
359        stderr_type: u8,
360    ) {
361        stdin_desc.increment_ref_count();
362        stdout_desc.increment_ref_count();
363        stderr_desc.increment_ref_count();
364        self.entries.insert(
365            0,
366            FdEntry {
367                fd: 0,
368                description: stdin_desc,
369                status_flags: 0,
370                fd_flags: 0,
371                rights: 0,
372                filetype: stdin_type,
373            },
374        );
375        self.entries.insert(
376            1,
377            FdEntry {
378                fd: 1,
379                description: stdout_desc,
380                status_flags: 0,
381                fd_flags: 0,
382                rights: 0,
383                filetype: stdout_type,
384            },
385        );
386        self.entries.insert(
387            2,
388            FdEntry {
389                fd: 2,
390                description: stderr_desc,
391                status_flags: 0,
392                fd_flags: 0,
393                rights: 0,
394                filetype: stderr_type,
395            },
396        );
397    }
398
399    pub fn open(&mut self, path: &str, flags: u32) -> FdResult<u32> {
400        self.open_with_details(path, flags, FILETYPE_REGULAR_FILE, None)
401    }
402
403    pub fn open_with_filetype(&mut self, path: &str, flags: u32, filetype: u8) -> FdResult<u32> {
404        self.open_with_details(path, flags, filetype, None)
405    }
406
407    pub fn open_with_details(
408        &mut self,
409        path: &str,
410        flags: u32,
411        filetype: u8,
412        lock_target: Option<FileLockTarget>,
413    ) -> FdResult<u32> {
414        let fd = self.allocate_fd()?;
415        let description =
416            self.alloc_desc
417                .allocate_with_lock(path, description_flags(flags), lock_target);
418        self.entries.insert(
419            fd,
420            FdEntry {
421                fd,
422                description,
423                status_flags: status_flags(flags),
424                fd_flags: 0,
425                rights: 0,
426                filetype,
427            },
428        );
429        Ok(fd)
430    }
431
432    pub fn open_with(
433        &mut self,
434        description: SharedFileDescription,
435        filetype: u8,
436        target_fd: Option<u32>,
437    ) -> FdResult<u32> {
438        let fd = match target_fd {
439            Some(fd) => {
440                self.validate_fd_bounds(fd)?;
441                if self.entries.contains_key(&fd) {
442                    self.close(fd);
443                }
444                fd
445            }
446            None => self.allocate_fd()?,
447        };
448        description.increment_ref_count();
449        self.entries.insert(
450            fd,
451            FdEntry {
452                fd,
453                description,
454                status_flags: 0,
455                fd_flags: 0,
456                rights: 0,
457                filetype,
458            },
459        );
460        Ok(fd)
461    }
462
463    pub fn get(&self, fd: u32) -> Option<&FdEntry> {
464        self.entries.get(&fd)
465    }
466
467    pub fn close(&mut self, fd: u32) -> bool {
468        let Some(entry) = self.entries.remove(&fd) else {
469            return false;
470        };
471        entry.description.decrement_ref_count();
472        true
473    }
474
475    pub fn dup(&mut self, fd: u32) -> FdResult<u32> {
476        self.dup_with_status_flags(fd, None)
477    }
478
479    pub fn dup_with_status_flags(
480        &mut self,
481        fd: u32,
482        status_flags_override: Option<u32>,
483    ) -> FdResult<u32> {
484        let entry = self
485            .entries
486            .get(&fd)
487            .cloned()
488            .ok_or_else(|| FdTableError::bad_file_descriptor(fd))?;
489        let new_fd = self.allocate_fd()?;
490        self.duplicate_entry(
491            &entry,
492            new_fd,
493            status_flags_override.unwrap_or(entry.status_flags),
494            0,
495        )
496    }
497
498    pub fn dup2(&mut self, old_fd: u32, new_fd: u32) -> FdResult<()> {
499        let entry = self
500            .entries
501            .get(&old_fd)
502            .cloned()
503            .ok_or_else(|| FdTableError::bad_file_descriptor(old_fd))?;
504        self.validate_fd_bounds(new_fd)?;
505        if old_fd == new_fd {
506            return Ok(());
507        }
508
509        if self.entries.contains_key(&new_fd) {
510            self.close(new_fd);
511        }
512
513        self.duplicate_entry(&entry, new_fd, entry.status_flags, 0)?;
514        Ok(())
515    }
516
517    pub fn stat(&self, fd: u32) -> FdResult<FdStat> {
518        let entry = self
519            .entries
520            .get(&fd)
521            .ok_or_else(|| FdTableError::bad_file_descriptor(fd))?;
522        Ok(FdStat {
523            filetype: entry.filetype,
524            flags: visible_fd_flags(entry.description.flags(), entry.status_flags),
525            rights: entry.rights,
526        })
527    }
528
529    pub fn fcntl(&mut self, fd: u32, command: u32, arg: u32) -> FdResult<u32> {
530        match command {
531            F_DUPFD => {
532                let entry = self
533                    .entries
534                    .get(&fd)
535                    .cloned()
536                    .ok_or_else(|| FdTableError::bad_file_descriptor(fd))?;
537                let min_fd = self.validate_fcntl_dup_min(arg)?;
538                let new_fd = self.allocate_fd_from(min_fd)?;
539                self.duplicate_entry(&entry, new_fd, entry.status_flags, 0)
540            }
541            F_GETFD => {
542                let entry = self
543                    .entries
544                    .get(&fd)
545                    .ok_or_else(|| FdTableError::bad_file_descriptor(fd))?;
546                Ok(entry.fd_flags & FD_CLOEXEC)
547            }
548            F_SETFD => {
549                let entry = self
550                    .entries
551                    .get_mut(&fd)
552                    .ok_or_else(|| FdTableError::bad_file_descriptor(fd))?;
553                entry.fd_flags = arg & FD_CLOEXEC;
554                Ok(0)
555            }
556            F_GETFL => {
557                let entry = self
558                    .entries
559                    .get(&fd)
560                    .ok_or_else(|| FdTableError::bad_file_descriptor(fd))?;
561                Ok(visible_fd_flags(
562                    entry.description.flags(),
563                    entry.status_flags,
564                ))
565            }
566            F_SETFL => {
567                let entry = self
568                    .entries
569                    .get_mut(&fd)
570                    .ok_or_else(|| FdTableError::bad_file_descriptor(fd))?;
571                entry.status_flags = arg & ENTRY_STATUS_FLAG_MASK;
572                entry.description.update_flags(SHARED_STATUS_FLAG_MASK, arg);
573                Ok(0)
574            }
575            _ => Err(FdTableError::invalid_argument(format!(
576                "unsupported fcntl command {command}"
577            ))),
578        }
579    }
580
581    pub fn fork(&self) -> Self {
582        let mut child = Self::new(self.alloc_desc.clone(), self.max_fds);
583        child.next_fd = self.next_fd;
584
585        for (fd, entry) in &self.entries {
586            entry.description.increment_ref_count();
587            child.entries.insert(
588                *fd,
589                FdEntry {
590                    fd: *fd,
591                    description: Arc::clone(&entry.description),
592                    status_flags: entry.status_flags,
593                    fd_flags: entry.fd_flags,
594                    rights: entry.rights,
595                    filetype: entry.filetype,
596                },
597            );
598        }
599
600        child
601    }
602
603    pub fn close_all(&mut self) {
604        let fds: Vec<u32> = self.entries.keys().copied().collect();
605        for fd in fds {
606            self.close(fd);
607        }
608    }
609
610    pub fn len(&self) -> usize {
611        self.entries.len()
612    }
613
614    pub fn is_empty(&self) -> bool {
615        self.entries.is_empty()
616    }
617
618    pub fn iter(&self) -> Values<'_, u32, FdEntry> {
619        self.entries.values()
620    }
621
622    fn allocate_fd(&mut self) -> FdResult<u32> {
623        if self.entries.len() >= self.max_fds {
624            return Err(FdTableError::too_many_open_files());
625        }
626
627        let start = usize::try_from(self.next_fd).unwrap_or(0) % self.max_fds;
628        for offset in 0..self.max_fds {
629            let candidate = ((start + offset) % self.max_fds) as u32;
630            if !self.entries.contains_key(&candidate) {
631                self.next_fd = candidate.saturating_add(1);
632                return Ok(candidate);
633            }
634        }
635
636        Err(FdTableError::too_many_open_files())
637    }
638
639    fn allocate_fd_from(&mut self, min_fd: u32) -> FdResult<u32> {
640        if self.entries.len() >= self.max_fds {
641            return Err(FdTableError::too_many_open_files());
642        }
643
644        if min_fd as usize >= self.max_fds {
645            return Err(FdTableError::invalid_argument(format!(
646                "fd {min_fd} exceeds process fd limit"
647            )));
648        }
649
650        for candidate in min_fd..self.max_fds as u32 {
651            if !self.entries.contains_key(&candidate) {
652                self.next_fd = candidate.saturating_add(1);
653                return Ok(candidate);
654            }
655        }
656
657        Err(FdTableError::too_many_open_files())
658    }
659
660    fn duplicate_entry(
661        &mut self,
662        entry: &FdEntry,
663        new_fd: u32,
664        status_flags: u32,
665        fd_flags: u32,
666    ) -> FdResult<u32> {
667        entry.description.increment_ref_count();
668        self.entries.insert(
669            new_fd,
670            FdEntry {
671                fd: new_fd,
672                description: Arc::clone(&entry.description),
673                status_flags,
674                fd_flags,
675                rights: entry.rights,
676                filetype: entry.filetype,
677            },
678        );
679        Ok(new_fd)
680    }
681
682    fn validate_fd_bounds(&self, fd: u32) -> FdResult<()> {
683        if fd as usize >= self.max_fds {
684            return Err(FdTableError::bad_file_descriptor(fd));
685        }
686        Ok(())
687    }
688
689    fn validate_fcntl_dup_min(&self, min_fd: u32) -> FdResult<u32> {
690        if min_fd as usize >= self.max_fds {
691            return Err(FdTableError::invalid_argument(format!(
692                "fd {min_fd} exceeds process fd limit"
693            )));
694        }
695        Ok(min_fd)
696    }
697}
698
699fn description_flags(flags: u32) -> u32 {
700    flags & !status_flags(flags)
701}
702
703fn status_flags(flags: u32) -> u32 {
704    flags & ENTRY_STATUS_FLAG_MASK
705}
706
707fn visible_fd_flags(description_flags: u32, entry_status_flags: u32) -> u32 {
708    (description_flags & (0b11 | SHARED_STATUS_FLAG_MASK))
709        | (entry_status_flags & ENTRY_STATUS_FLAG_MASK)
710}
711
/// Status bits that `F_SETFL` writes to the shared description.
const SHARED_STATUS_FLAG_MASK: u32 = O_APPEND;
/// Status bits that are stored on the individual table entry.
const ENTRY_STATUS_FLAG_MASK: u32 = O_NONBLOCK;
714
715impl<'a> IntoIterator for &'a ProcessFdTable {
716    type Item = &'a FdEntry;
717    type IntoIter = Values<'a, u32, FdEntry>;
718
719    fn into_iter(self) -> Self::IntoIter {
720        self.entries.values()
721    }
722}
723
724#[derive(Debug, Clone)]
725pub struct FdTableManager {
726    tables: BTreeMap<u32, ProcessFdTable>,
727    alloc_desc: DescriptionFactory,
728    max_fds: usize,
729}
730
731impl Default for FdTableManager {
732    fn default() -> Self {
733        Self {
734            tables: BTreeMap::new(),
735            alloc_desc: DescriptionFactory::new(1),
736            max_fds: MAX_FDS_PER_PROCESS,
737        }
738    }
739}
740
741impl FdTableManager {
742    pub fn new() -> Self {
743        Self::default()
744    }
745
746    pub fn with_max_fds(max_fds: usize) -> Self {
747        Self {
748            max_fds,
749            ..Self::default()
750        }
751    }
752
753    pub fn create(&mut self, pid: u32) -> &mut ProcessFdTable {
754        let mut table = ProcessFdTable::new(self.alloc_desc.clone(), self.max_fds);
755        table.init_stdio(
756            self.alloc_desc.allocate("/dev/stdin", O_RDONLY),
757            self.alloc_desc.allocate("/dev/stdout", O_WRONLY),
758            self.alloc_desc.allocate("/dev/stderr", O_WRONLY),
759        );
760        self.remove(pid);
761        self.tables.insert(pid, table);
762        self.tables
763            .get_mut(&pid)
764            .expect("newly created FD table should be stored")
765    }
766
767    pub fn create_with_stdio(
768        &mut self,
769        pid: u32,
770        stdin_override: Option<StdioOverride>,
771        stdout_override: Option<StdioOverride>,
772        stderr_override: Option<StdioOverride>,
773    ) -> &mut ProcessFdTable {
774        let mut table = ProcessFdTable::new(self.alloc_desc.clone(), self.max_fds);
775        let stdin_desc = stdin_override
776            .as_ref()
777            .map(|entry| Arc::clone(&entry.description))
778            .unwrap_or_else(|| self.alloc_desc.allocate("/dev/stdin", O_RDONLY));
779        let stdout_desc = stdout_override
780            .as_ref()
781            .map(|entry| Arc::clone(&entry.description))
782            .unwrap_or_else(|| self.alloc_desc.allocate("/dev/stdout", O_WRONLY));
783        let stderr_desc = stderr_override
784            .as_ref()
785            .map(|entry| Arc::clone(&entry.description))
786            .unwrap_or_else(|| self.alloc_desc.allocate("/dev/stderr", O_WRONLY));
787
788        table.init_stdio_with_types(
789            stdin_desc,
790            stdin_override
791                .as_ref()
792                .map(|entry| entry.filetype)
793                .unwrap_or(FILETYPE_CHARACTER_DEVICE),
794            stdout_desc,
795            stdout_override
796                .as_ref()
797                .map(|entry| entry.filetype)
798                .unwrap_or(FILETYPE_CHARACTER_DEVICE),
799            stderr_desc,
800            stderr_override
801                .as_ref()
802                .map(|entry| entry.filetype)
803                .unwrap_or(FILETYPE_CHARACTER_DEVICE),
804        );
805        self.remove(pid);
806        self.tables.insert(pid, table);
807        self.tables
808            .get_mut(&pid)
809            .expect("newly created FD table should be stored")
810    }
811
812    pub fn fork(&mut self, parent_pid: u32, child_pid: u32) -> &mut ProcessFdTable {
813        if !self.tables.contains_key(&parent_pid) {
814            return self.create(child_pid);
815        }
816
817        let child = self
818            .tables
819            .get(&parent_pid)
820            .expect("parent table presence was checked")
821            .fork();
822        self.remove(child_pid);
823        self.tables.insert(child_pid, child);
824        self.tables
825            .get_mut(&child_pid)
826            .expect("forked FD table should be stored")
827    }
828
829    pub fn get(&self, pid: u32) -> Option<&ProcessFdTable> {
830        self.tables.get(&pid)
831    }
832
833    pub fn get_mut(&mut self, pid: u32) -> Option<&mut ProcessFdTable> {
834        self.tables.get_mut(&pid)
835    }
836
837    pub fn has(&self, pid: u32) -> bool {
838        self.tables.contains_key(&pid)
839    }
840
841    pub fn len(&self) -> usize {
842        self.tables.len()
843    }
844
845    pub fn is_empty(&self) -> bool {
846        self.tables.is_empty()
847    }
848
849    pub fn total_open_fds(&self) -> usize {
850        self.tables.values().map(ProcessFdTable::len).sum()
851    }
852
853    pub fn pids(&self) -> Vec<u32> {
854        self.tables.keys().copied().collect()
855    }
856
857    pub fn remove(&mut self, pid: u32) {
858        if let Some(mut table) = self.tables.remove(&pid) {
859            table.close_all();
860        }
861    }
862}
863
864#[derive(Debug, Clone, Default)]
865pub struct FileLockManager {
866    inner: Arc<FileLockManagerInner>,
867}
868
869#[derive(Debug, Default)]
870struct FileLockManagerInner {
871    state: Mutex<FileLockState>,
872    wake: Condvar,
873}
874
875#[derive(Debug, Default)]
876struct FileLockState {
877    entries: BTreeMap<FileLockTarget, FileLockEntry>,
878}
879
880#[derive(Debug, Default)]
881struct FileLockEntry {
882    shared: BTreeSet<u64>,
883    exclusive: Option<u64>,
884}
885
886impl FileLockManager {
887    pub fn new() -> Self {
888        Self::default()
889    }
890
891    pub fn apply(
892        &self,
893        owner_id: u64,
894        target: FileLockTarget,
895        operation: FlockOperation,
896    ) -> FdResult<()> {
897        match operation {
898            FlockOperation::Shared { nonblocking } => {
899                self.acquire(owner_id, target, FileLockMode::Shared, nonblocking)
900            }
901            FlockOperation::Exclusive { nonblocking } => {
902                self.acquire(owner_id, target, FileLockMode::Exclusive, nonblocking)
903            }
904            FlockOperation::Unlock => {
905                self.release_owner(owner_id);
906                Ok(())
907            }
908        }
909    }
910
911    pub fn release_owner(&self, owner_id: u64) -> bool {
912        let mut state = lock_or_recover(&self.inner.state);
913        let mut released = false;
914        state.entries.retain(|_, entry| {
915            let entry_changed = entry.shared.remove(&owner_id) || entry.exclusive == Some(owner_id);
916            if entry.exclusive == Some(owner_id) {
917                entry.exclusive = None;
918            }
919            released |= entry_changed;
920            !entry.is_empty()
921        });
922        drop(state);
923        if released {
924            self.inner.wake.notify_all();
925        }
926        released
927    }
928
929    fn acquire(
930        &self,
931        owner_id: u64,
932        target: FileLockTarget,
933        mode: FileLockMode,
934        nonblocking: bool,
935    ) -> FdResult<()> {
936        let mut state = lock_or_recover(&self.inner.state);
937        loop {
938            let entry = state.entries.entry(target).or_default();
939            if entry.can_grant(owner_id, mode) {
940                entry.grant(owner_id, mode);
941                return Ok(());
942            }
943
944            if nonblocking {
945                return Err(FdTableError::would_block(
946                    "advisory file lock is unavailable",
947                ));
948            }
949
950            state = wait_or_recover(&self.inner.wake, state);
951        }
952    }
953}
954
955impl FileLockEntry {
956    fn can_grant(&self, owner_id: u64, mode: FileLockMode) -> bool {
957        match mode {
958            FileLockMode::Shared => self.exclusive.is_none_or(|owner| owner == owner_id),
959            FileLockMode::Exclusive => {
960                self.exclusive.is_none_or(|owner| owner == owner_id)
961                    && self.shared.iter().all(|owner| *owner == owner_id)
962            }
963        }
964    }
965
966    fn grant(&mut self, owner_id: u64, mode: FileLockMode) {
967        match mode {
968            FileLockMode::Shared => {
969                self.exclusive = None;
970                self.shared.insert(owner_id);
971            }
972            FileLockMode::Exclusive => {
973                self.shared.retain(|owner| *owner != owner_id);
974                self.exclusive = Some(owner_id);
975            }
976        }
977    }
978
979    fn is_empty(&self) -> bool {
980        self.exclusive.is_none() && self.shared.is_empty()
981    }
982}
983
/// Locks `mutex`, taking the guard even if a previous holder panicked and left
/// the mutex poisoned.
fn lock_or_recover<T>(mutex: &Mutex<T>) -> MutexGuard<'_, T> {
    match mutex.lock() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    }
}
989
/// Waits on `condvar` with `guard`, handing the guard back even when the mutex
/// was poisoned while waiting.
fn wait_or_recover<'a, T>(condvar: &Condvar, guard: MutexGuard<'a, T>) -> MutexGuard<'a, T> {
    match condvar.wait(guard) {
        Ok(reacquired) => reacquired,
        Err(poisoned) => poisoned.into_inner(),
    }
}