tracepoint/
native.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT license.
3
4use core::ffi;
5use core::marker;
6use core::mem::size_of;
7use core::pin::Pin;
8use core::sync::atomic::AtomicI32;
9use core::sync::atomic::AtomicU32;
10use core::sync::atomic::Ordering;
11
12use crate::descriptors::EventDataDescriptor;
13
14#[cfg(all(target_os = "linux", feature = "user_events"))]
15use libc as linux;
16
// Process-wide state for the `user_events_data` file. `TracepointState` consults it
// when registering, unregistering, and writing events.
// Note: this is intentionally leaked.
static USER_EVENTS_DATA_FILE: UserEventsDataFile = UserEventsDataFile::new();
19
/// Fetches the error code left behind by a failed errno-setting operation.
///
/// Requires: an errno-setting operation has just failed.
///
/// Returns the current value of `linux::errno`. Debug builds additionally
/// assert that the value is positive, because calling this without a preceding
/// failure is a caller bug.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn get_failure_errno() -> i32 {
    // SAFETY: relies on `__errno_location` handing back a valid, readable
    // pointer to errno.
    let errno = unsafe { linux::__errno_location().read() };
    debug_assert!(errno > 0); // Only meaningful right after an errno-based failure.
    errno
}
30
/// Resets `linux::errno` to 0 so that a later failure can be told apart from a
/// stale value.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn clear_errno() {
    // SAFETY: relies on `__errno_location` handing back a valid, writable
    // pointer to errno.
    unsafe { linux::__errno_location().write(0) };
}
36
/// linux::open(path0, O_WRONLY | O_CLOEXEC)
///
/// `path0` is the path to open, including its terminating NUL byte.
///
/// The descriptor is opened with `O_CLOEXEC` so that it is not inherited by
/// programs this process later `exec`s; an inherited descriptor would be of no
/// use there and would only keep the file open longer than intended.
///
/// Returns the value produced by `linux::open`: callers in this file treat a
/// non-negative result as the new file descriptor and anything else as a
/// failure described by errno.
///
/// # Panics
///
/// Panics if `path0` does not end with a NUL byte.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn open_wronly(path0: &[u8]) -> ffi::c_int {
    assert!(path0.ends_with(&[0]));
    // SAFETY: the assert above guarantees that `path0` contains a NUL
    // terminator, so `open` reads a bounded C string.
    return unsafe {
        linux::open(
            path0.as_ptr().cast::<ffi::c_char>(),
            linux::O_WRONLY | linux::O_CLOEXEC,
        )
    };
}
43
/// Tracks the `user_events_data` file descriptor, or the reason it is not open.
struct UserEventsDataFile {
    /// Single-word state:
    /// - `-EAGAIN` (the initial value): no open attempt is recorded.
    /// - Any other negative value: `-errno` explaining why the open failed.
    /// - Non-negative value: file descriptor for the "user_events_data" file.
    file_or_error: AtomicI32,
}

impl UserEventsDataFile {
    /// `-EAGAIN`, the "not opened yet" state. `close` restores this value.
    const EAGAIN_ERROR: i32 = -11;

    /// True for the blanks that separate fields in a mounts line: space and tab.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const fn is_space_char(ch: u8) -> bool {
        matches!(ch, b' ' | b'\t')
    }

    /// True for any byte that is neither a field separator nor the NUL terminator.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const fn is_nonspace_char(ch: u8) -> bool {
        !(ch == b'\0' || Self::is_space_char(ch))
    }

    /// Starting at `start`, steps over every byte of `line` accepted by `accept`
    /// and returns the index of the first byte that is not accepted.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    fn scan_while(line: &[u8], start: usize, accept: fn(u8) -> bool) -> usize {
        let mut pos = start;
        while accept(line[pos]) {
            pos += 1;
        }
        pos
    }

    /// Locates the `user_events_data` file and opens it for writing.
    ///
    /// Returns the new file descriptor (non-negative) on success, or `-errno`
    /// on failure (`-ENOTSUP` when no tracefs/debugfs mount was found).
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    fn open_data_file() -> i32 {
        // The file is ".../tracing/user_events_data" inside tracefs or debugfs.
        // Try the usual tracefs mount point before doing anything expensive.
        let default_file = open_wronly(b"/sys/kernel/tracing/user_events_data\0");
        if default_file >= 0 {
            return default_file;
        }

        // Fall back to discovering the mount point from "/proc/mounts".
        clear_errno();
        let mounts_file = unsafe {
            linux::fopen(
                "/proc/mounts\0".as_ptr().cast::<ffi::c_char>(),
                "r\0".as_ptr().cast::<ffi::c_char>(),
            )
        };
        if mounts_file.is_null() {
            return -get_failure_errno();
        }

        // `path` stays all-zero until a candidate is found, so `path[0] == 0`
        // doubles as the "nothing found yet" flag.
        let mut path = [0u8; 274]; // 256 + sizeof("/user_events_data")
        let mut line = [0u8; 4097];
        loop {
            let fgets_result = unsafe {
                linux::fgets(
                    line.as_mut_ptr().cast::<ffi::c_char>(),
                    line.len() as ffi::c_int,
                    mounts_file,
                )
            };
            if fgets_result.is_null() {
                break;
            }

            // Each line reads "device_name mount_point file_system other_stuff...".
            let device_end = Self::scan_while(&line, 0, Self::is_nonspace_char);
            let mount_begin = Self::scan_while(&line, device_end, Self::is_space_char);
            let mount_end = Self::scan_while(&line, mount_begin, Self::is_nonspace_char);
            let fs_begin = Self::scan_while(&line, mount_end, Self::is_space_char);
            let fs_end = Self::scan_while(&line, fs_begin, Self::is_nonspace_char);

            if !Self::is_space_char(line[fs_end]) {
                // No whitespace after file_system: not a line we understand.
                continue;
            }

            // Prefer "tracefs" over "debugfs": a debugfs hit is remembered (only
            // the first one), but the scan continues in case a tracefs shows up.
            let (path_suffix, keep_looking): (&[u8], bool) = match &line[fs_begin..fs_end] {
                // "tracefsMountPoint/user_events_data"
                b"tracefs" => (&b"/user_events_data\0"[..], false),
                // "debugfsMountPoint/tracing/user_events_data"
                b"debugfs" if path[0] == 0 => (&b"/tracing/user_events_data\0"[..], true),
                _ => continue,
            };

            let mount_len = mount_end - mount_begin;
            let path_len = mount_len + path_suffix.len(); // Counts the NUL.
            if path_len > path.len() {
                continue;
            }

            // path = mount point + suffix (the suffix carries the NUL).
            path[..mount_len].copy_from_slice(&line[mount_begin..mount_end]);
            path[mount_len..path_len].copy_from_slice(path_suffix);

            if !keep_looking {
                break;
            }
        }

        unsafe { linux::fclose(mounts_file) };

        if path[0] == 0 {
            return -linux::ENOTSUP;
        }

        // path is now something like "/sys/kernel/tracing/user_events_data\0" or
        // "/sys/kernel/debug/tracing/user_events_data\0".
        clear_errno();
        let file = open_wronly(&path);
        if file >= 0 {
            file
        } else {
            -get_failure_errno()
        }
    }

    /// Attempts to open the `user_events_data` file and publishes the outcome.
    ///
    /// The outcome is either a non-negative file descriptor or a negative
    /// `-errno`. It is stored into `self.file_or_error` atomically. If the
    /// stored value is already a file descriptor, or the new outcome is an
    /// error, the stored value wins and any descriptor opened here is closed.
    /// Returns whatever value `self.file_or_error` ends up holding.
    ///
    /// When the crate is not built for Linux user_events, nothing is opened and
    /// the outcome is simply `0`.
    fn update(&self) -> i32 {
        #[cfg(not(all(target_os = "linux", feature = "user_events")))]
        let candidate: i32 = 0;
        #[cfg(all(target_os = "linux", feature = "user_events"))]
        let candidate: i32 = Self::open_data_file();

        let mut expected = Self::EAGAIN_ERROR;
        loop {
            let current = match self.file_or_error.compare_exchange(
                expected,
                candidate,
                Ordering::Relaxed,
                Ordering::Relaxed,
            ) {
                // Our candidate is now the published state.
                Ok(_) => return candidate,
                // Somebody else published `current` first.
                Err(current) => current,
            };

            if current >= 0 || candidate < 0 {
                // The published state is at least as good as ours: keep it.
                #[cfg(all(target_os = "linux", feature = "user_events"))]
                if candidate >= 0 {
                    unsafe { linux::close(candidate) };
                }
                return current;
            }

            // Published state is an error but we hold a file: retry against it.
            expected = current;
        }
    }

    // Creates the tracker in its initial state, -EAGAIN.
    pub const fn new() -> Self {
        Self {
            file_or_error: AtomicI32::new(Self::EAGAIN_ERROR),
        }
    }

    // Resets the state to -EAGAIN, closing the file if one was open.
    pub fn close(&self) {
        let previous = self
            .file_or_error
            .swap(Self::EAGAIN_ERROR, Ordering::Relaxed);
        if previous >= 0 {
            #[cfg(all(target_os = "linux", feature = "user_events"))]
            unsafe {
                linux::close(previous)
            };
        }
    }

    // Reports the current state without trying to open anything: a non-negative
    // user_events_data file descriptor, or -errno if the file is not open.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    pub fn peek(&self) -> i32 {
        self.file_or_error.load(Ordering::Relaxed)
    }

    // Returns the current state, first attempting to open the
    // `user_events_data` file if the state is still -EAGAIN (no attempt
    // recorded). The result is a non-negative user_events_data file descriptor
    // on success or -errno for error.
    #[inline]
    pub fn get(&self) -> i32 {
        match self.file_or_error.load(Ordering::Relaxed) {
            Self::EAGAIN_ERROR => self.update(),
            file_or_error => file_or_error,
        }
    }
}

impl Drop for UserEventsDataFile {
    /// Closes the file, if open, when the tracker goes away.
    fn drop(&mut self) {
        self.close();
    }
}
278
/// Low-level API: Represents a tracepoint registration.
///
/// Registration goes through `Pin<&Self>` and the type is marked `!Unpin`,
/// because the address of `enable_status` is handed to the kernel when the
/// tracepoint is registered.
pub struct TracepointState {
    /// The kernel will update this variable with tracepoint enable/disable state.
    /// It will be 0 if tracepoint is disabled, nonzero if tracepoint is enabled.
    enable_status: AtomicU32,

    /// This will be a kernel-assigned value if registered,
    /// `UNREGISTERED_WRITE_INDEX` or `BUSY_WRITE_INDEX` if not registered.
    write_index: AtomicU32,

    /// Opts the type out of `Unpin`.
    _pinned: marker::PhantomPinned,
}
291
292impl TracepointState {
293    const UNREGISTERED_WRITE_INDEX: u32 = u32::MAX;
294    const BUSY_WRITE_INDEX: u32 = u32::MAX - 1;
295    const HIGHEST_VALID_WRITE_INDEX: u32 = u32::MAX - 2;
296
297    #[cfg(all(target_os = "linux", feature = "user_events"))]
298    const IOC_WRITE: ffi::c_ulong = 1;
299
300    #[cfg(all(target_os = "linux", feature = "user_events"))]
301    const IOC_READ: ffi::c_ulong = 2;
302
303    #[cfg(all(target_os = "linux", feature = "user_events"))]
304    const DIAG_IOC_MAGIC: ffi::c_ulong = '*' as ffi::c_ulong;
305
306    #[cfg(all(target_os = "linux", feature = "user_events"))]
307    const DIAG_IOCSREG: ffi::c_ulong =
308        Self::ioc(Self::IOC_WRITE | Self::IOC_READ, Self::DIAG_IOC_MAGIC, 0);
309
310    #[cfg(all(target_os = "linux", feature = "user_events"))]
311    const DIAG_IOCSUNREG: ffi::c_ulong = Self::ioc(Self::IOC_WRITE, Self::DIAG_IOC_MAGIC, 2);
312
313    #[cfg(all(target_os = "linux", feature = "user_events"))]
314    const fn ioc(dir: ffi::c_ulong, typ: ffi::c_ulong, nr: ffi::c_ulong) -> ffi::c_ulong {
315        const IOC_NRBITS: u8 = 8;
316        const IOC_TYPEBITS: u8 = 8;
317        const IOC_SIZEBITS: u8 = 14;
318        const IOC_NRSHIFT: u8 = 0;
319        const IOC_TYPESHIFT: u8 = IOC_NRSHIFT + IOC_NRBITS;
320        const IOC_SIZESHIFT: u8 = IOC_TYPESHIFT + IOC_TYPEBITS;
321        const IOC_DIRSHIFT: u8 = IOC_SIZESHIFT + IOC_SIZEBITS;
322
323        return (dir << IOC_DIRSHIFT)
324            | (typ << IOC_TYPESHIFT)
325            | (nr << IOC_NRSHIFT)
326            | ((size_of::<usize>() as ffi::c_ulong) << IOC_SIZESHIFT);
327    }
328
329    /// Creates a new unregistered tracepoint.
330    ///
331    /// initial_enable_status is normally 0, since an unregistered tracepoint will
332    /// normally be considered disabled.
333    pub const fn new(initial_enable_status: u32) -> Self {
334        return Self {
335            enable_status: AtomicU32::new(initial_enable_status),
336            write_index: AtomicU32::new(Self::UNREGISTERED_WRITE_INDEX),
337            _pinned: marker::PhantomPinned,
338        };
339    }
340
341    /// Returns true if this tracepoint is enabled, i.e. `enable_status != 0`.
342    #[inline(always)]
343    pub fn enabled(&self) -> bool {
344        return 0 != self.enable_status.load(Ordering::Relaxed);
345    }
346
347    /// Unregisters this tracepoint.
348    ///
349    /// Returns 0 for success, error code (e.g. EBUSY, EALREADY) for error.
350    /// Error code is usually ignored in retail code, but may be helpful during
351    /// development to understand behavior or track down issues.
352    pub fn unregister(&self) -> i32 {
353        let error;
354
355        let old_write_index = self
356            .write_index
357            .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
358        match old_write_index {
359            Self::BUSY_WRITE_INDEX => {
360                error = 16; // EBUSY: Another thread is registering/unregistering. Do nothing.
361                return error; // Return immediately, need to leave write_index = BUSY.
362            }
363            Self::UNREGISTERED_WRITE_INDEX => {
364                error = 116; // EALREADY: Already unregistered. No action needed.
365            }
366            _ => {
367                #[cfg(not(all(target_os = "linux", feature = "user_events")))]
368                {
369                    error = 0;
370                }
371
372                #[cfg(all(target_os = "linux", feature = "user_events"))]
373                {
374                    #[repr(C, packed)]
375                    #[allow(non_camel_case_types)]
376                    struct user_unreg {
377                        size: u32,
378                        disable_bit: u8,
379                        reserved1: u8,
380                        reserved2: u16,
381                        disable_addr: u64,
382                    }
383
384                    let unreg = user_unreg {
385                        size: size_of::<user_unreg>() as u32,
386                        disable_bit: 0,
387                        reserved1: 0,
388                        reserved2: 0,
389                        disable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
390                    };
391
392                    clear_errno();
393                    let ioctl_result = unsafe {
394                        linux::ioctl(USER_EVENTS_DATA_FILE.peek(), Self::DIAG_IOCSUNREG, &unreg)
395                    };
396                    if 0 > ioctl_result {
397                        error = get_failure_errno();
398                    } else {
399                        error = 0;
400                    }
401                }
402            }
403        }
404
405        let old_write_index = self
406            .write_index
407            .swap(Self::UNREGISTERED_WRITE_INDEX, Ordering::Relaxed);
408        debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
409
410        return error;
411    }
412
413    /// Registers this tracepoint.
414    ///
415    /// Requires: this `TracepointState` is not currently registered.
416    ///
417    /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
418    /// is usually ignored in retail scenarios but may be helpful during development to
419    /// understand behavior or track down issues.
420    ///
421    /// `_name_args` is the tracepoint definition in the format
422    /// `Name[ FieldDef1[;FieldDef2...]]`. For example:
423    ///
424    /// - `MyTracepoint1`
425    /// - `MyTracepoint2 u32 Field1`
426    /// - `MyTracepoint3 u32 Field1;char Field2[20]`
427    ///
428    /// # Safety
429    ///
430    /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
431    /// will unregister itself when dropped, so this is only an issue if the tracepoint
432    /// is not dropped before it is deallocated. This might happen for a static variable
433    /// in a shared library that gets unloaded.
434    pub unsafe fn register(self: Pin<&Self>, _name_args: &ffi::CStr) -> i32 {
435        return self.register_with_flags(_name_args, 0);
436    }
437
438    /// Advanced: Registers this tracepoint using the specified `user_reg` flags.
439    ///
440    /// Requires: this `TracepointState` is not currently registered.
441    ///
442    /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
443    /// is usually ignored in retail scenarios but may be helpful during development to
444    /// understand behavior or track down issues.
445    ///
446    /// `_name_args` is the tracepoint definition in the format
447    /// `Name[ FieldDef1[;FieldDef2...]]`. For example:
448    ///
449    /// - `MyTracepoint1`
450    /// - `MyTracepoint2 u32 Field1`
451    /// - `MyTracepoint3 u32 Field1;char Field2[20]`
452    ///
453    /// `_flags` is normally `0`, but may also be set to a `user_reg` flag such as
454    /// `USER_EVENT_REG_PERSIST`.
455    ///
456    /// # Safety
457    ///
458    /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
459    /// will unregister itself when dropped, so this is only an issue if the tracepoint
460    /// is not dropped before it is deallocated. This might happen for a static variable
461    /// in a shared library that gets unloaded.
462    pub unsafe fn register_with_flags(
463        self: Pin<&Self>,
464        _name_args: &ffi::CStr,
465        _flags: u16,
466    ) -> i32 {
467        let error;
468        let new_write_index;
469
470        let old_write_index = self
471            .write_index
472            .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
473        assert!(
474            old_write_index == Self::UNREGISTERED_WRITE_INDEX,
475            "register of active tracepoint (already-registered or being-unregistered)"
476        );
477
478        let user_events_data = USER_EVENTS_DATA_FILE.get();
479        if user_events_data < 0 {
480            error = -user_events_data;
481            new_write_index = Self::UNREGISTERED_WRITE_INDEX;
482        } else {
483            #[cfg(not(all(target_os = "linux", feature = "user_events")))]
484            {
485                error = 0;
486                new_write_index = 0;
487            }
488
489            #[cfg(all(target_os = "linux", feature = "user_events"))]
490            {
491                #[repr(C, packed)]
492                #[allow(non_camel_case_types)]
493                struct user_reg {
494                    size: u32,
495                    enable_bit: u8,
496                    enable_size: u8,
497                    flags: u16,
498                    enable_addr: u64,
499                    name_args: u64,
500                    write_index: u32,
501                }
502
503                let mut reg = user_reg {
504                    size: size_of::<user_reg>() as u32,
505                    enable_bit: 0,
506                    enable_size: 4,
507                    flags: _flags,
508                    enable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
509                    name_args: _name_args.as_ptr() as usize as u64,
510                    write_index: 0,
511                };
512
513                clear_errno();
514                let ioctl_result =
515                    unsafe { linux::ioctl(user_events_data, Self::DIAG_IOCSREG, &mut reg) };
516                if 0 > ioctl_result {
517                    error = get_failure_errno();
518                    new_write_index = Self::UNREGISTERED_WRITE_INDEX;
519                } else {
520                    error = 0;
521                    new_write_index = reg.write_index;
522                    debug_assert!(new_write_index <= Self::HIGHEST_VALID_WRITE_INDEX);
523                }
524            }
525        }
526
527        let old_write_index = self.write_index.swap(new_write_index, Ordering::Relaxed);
528        debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
529
530        return error;
531    }
532
533    /// Generates an event.
534    ///
535    /// Requires: `data[0].is_empty()` since it will be used for the event headers.
536    ///
537    /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
538    /// is usually ignored in retail scenarios but may be helpful during development to
539    /// understand behavior or track down issues.
540    ///
541    /// If disabled or unregistered, this method does nothing and returnes EBADF.
542    /// Otherwise, sets `data[0] = write_index` then sends `data[..]` to the
543    /// `user_events_data` file handle.
544    ///
545    /// The event's payload is the concatenation of the remaining data blocks, if any
546    /// (i.e. `data[1..]`).
547    ///
548    /// The payload's layout should match the args specified in the call to `register`.
549    pub fn write(&self, data: &mut [EventDataDescriptor]) -> i32 {
550        debug_assert!(data[0].is_empty());
551
552        let enable_status = self.enable_status.load(Ordering::Relaxed);
553        let write_index = self.write_index.load(Ordering::Relaxed);
554        if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
555            return 9; // linux::EBADF
556        }
557
558        let writev_result = self.writev(data, &write_index.to_ne_bytes());
559        return writev_result;
560    }
561
562    /// Generates an event with headers.
563    ///
564    /// Requires: `data[0].is_empty()` since it will be used for the event headers;
565    /// `headers.len() >= 4` since it will be used for `write_index`.
566    ///
567    /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
568    /// is usually ignored in retail scenarios but may be helpful during development to
569    /// understand behavior or track down issues.
570    ///
571    /// If disabled or unregistered, this method does nothing and returnes EBADF.
572    /// Otherwise, sets `data[0] = headers` and `headers[0..4] = write_index`, then sends
573    /// `data[..]` to the `user_events_data` file.
574    ///
575    /// The event's payload is the concatenation of the remaining data blocks, if any
576    /// (i.e. `data[1..]`).
577    ///
578    /// The payload's layout should match the args specified in the call to `register`.
579    pub fn write_with_headers(&self, data: &mut [EventDataDescriptor], headers: &mut [u8]) -> i32 {
580        debug_assert!(data[0].is_empty());
581        debug_assert!(headers.len() >= 4);
582
583        let enable_status = self.enable_status.load(Ordering::Relaxed);
584        let write_index = self.write_index.load(Ordering::Relaxed);
585        if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
586            return 9; // linux::EBADF
587        }
588
589        *<&mut [u8; 4]>::try_from(&mut headers[0..4]).unwrap() = write_index.to_ne_bytes();
590
591        let writev_result = self.writev(data, headers);
592        return writev_result;
593    }
594
595    // Returns 0 for success, errno for error.
596    fn writev(&self, _data: &mut [EventDataDescriptor], _headers: &[u8]) -> i32 {
597        #[cfg(all(target_os = "linux", feature = "user_events"))]
598        unsafe {
599            // Unsafe: Putting headers into a container a with longer lifetime.
600            _data[0] =
601                EventDataDescriptor::from_raw_ptr(_headers.as_ptr() as usize, _headers.len());
602
603            let writev_result = linux::writev(
604                USER_EVENTS_DATA_FILE.peek(),
605                _data.as_ptr() as *const linux::iovec,
606                _data.len() as i32,
607            );
608
609            // Clear the container before headers lifetime ends.
610            _data[0] = EventDataDescriptor::zero();
611
612            if 0 > writev_result {
613                return get_failure_errno();
614            }
615        }
616
617        return 0;
618    }
619}
620
621impl Drop for TracepointState {
622    fn drop(&mut self) {
623        self.unregister();
624    }
625}
626
/// Possible configurations under which this crate can be compiled: `LinuxUserEvents` or
/// `Other`.
///
/// The type is `Copy`, comparable, and printable so that callers can write
/// `NATIVE_IMPLEMENTATION == NativeImplementation::LinuxUserEvents` or log the
/// value directly.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NativeImplementation {
    /// Crate compiled for other configuration (no logging is performed).
    Other,

    /// Crate compiled for Linux user_events configuration (logging is performed via
    /// `user_events_data` file).
    LinuxUserEvents,
}

/// The configuration under which this crate was compiled: `LinuxUserEvents` or `Other`.
///
/// It is `LinuxUserEvents` only when the target OS is Linux and the
/// `user_events` feature is enabled.
pub const NATIVE_IMPLEMENTATION: NativeImplementation =
    if cfg!(all(target_os = "linux", feature = "user_events")) {
        NativeImplementation::LinuxUserEvents
    } else {
        NativeImplementation::Other
    };