tracepoint/
native.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT license.
3
4use core::ffi;
5use core::marker;
6use core::pin::Pin;
7use core::sync::atomic::AtomicI32;
8use core::sync::atomic::AtomicU32;
9use core::sync::atomic::Ordering;
10
11use crate::descriptors::EventDataDescriptor;
12
13#[cfg(all(target_os = "linux", feature = "user_events"))]
14use core::mem::size_of;
15
16#[cfg(all(target_os = "linux", feature = "user_events"))]
17use libc as linux;
18
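// Note: this is intentionally leaked: a `static` is never dropped, so its `Drop`
// impl (which would close the file) does not run at process exit.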
20static USER_EVENTS_DATA_FILE: UserEventsDataFile = UserEventsDataFile::new();
21
/// Fetches the calling thread's `errno` after a failed libc call.
///
/// Precondition: an errno-setting operation has just reported failure, so
/// `errno` is expected to be positive. This is verified only by a debug
/// assertion; release builds return whatever value is present.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn get_failure_errno() -> i32 {
    // SAFETY: relies on libc's `__errno_location` returning a valid pointer to
    // this thread's errno.
    let errno = unsafe { linux::__errno_location().read() };
    // Calling this without a preceding failure is a logic error in the caller.
    debug_assert!(errno > 0);
    errno
}
32
/// Resets the calling thread's `errno` to 0 so that a later read reflects
/// only the operation that follows.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn clear_errno() {
    // SAFETY: relies on libc's `__errno_location` returning a valid pointer to
    // this thread's errno.
    unsafe { linux::__errno_location().write(0) }
}
38
/// Calls `linux::open(path0, O_WRONLY)` and returns its raw result.
///
/// `path0` must end with a NUL byte; panics otherwise.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn open_wronly(path0: &[u8]) -> ffi::c_int {
    assert!(path0.ends_with(&[0]));
    let c_path = path0.as_ptr().cast::<ffi::c_char>();
    // SAFETY: the assertion above guarantees `c_path` points at a
    // NUL-terminated byte string that outlives the call.
    unsafe { linux::open(c_path, linux::O_WRONLY) }
}
45
/// State of the `user_events_data` file handle, packed into one atomic integer.
struct UserEventsDataFile {
    /// Either an open file descriptor or a negated errno:
    ///
    /// - `-EAGAIN` is the initial value.
    /// - Any other negative value is `-errno` from a failed open.
    /// - A non-negative value is the file descriptor for the
    ///   "user_events_data" file.
    file_or_error: AtomicI32,
}
52
53impl UserEventsDataFile {
    /// Sentinel stored in `file_or_error` by `new` and `close`; `get` treats it
    /// as "no open attempt recorded" and calls `update`. Intended to be
    /// `-EAGAIN` (EAGAIN is 11 on Linux).
    const EAGAIN_ERROR: i32 = -11;
55
56    #[cfg(all(target_os = "linux", feature = "user_events"))]
57    const fn is_space_char(ch: u8) -> bool {
58        return ch == b' ' || ch == b'\t';
59    }
60
61    #[cfg(all(target_os = "linux", feature = "user_events"))]
62    const fn is_nonspace_char(ch: u8) -> bool {
63        return ch != b'\0' && !Self::is_space_char(ch);
64    }
65
    /// Opens a file descriptor to the `user_events_data` file.
    /// Atomically updates `self.file_or_error` to either a negative
    /// value (-errno returned from `linux::open`) or a non-negative value
    /// (the file descriptor). If `self.file_or_error` already contains a
    /// non-negative value, the existing value is retained and the new
    /// descriptor is closed. In all cases, returns the final value of
    /// `self.file_or_error`.
    ///
    /// Lookup order on Linux with the `user_events` feature:
    /// 1. `/sys/kernel/tracing/user_events_data`.
    /// 2. Otherwise, the mount point found by scanning `/proc/mounts`,
    ///    preferring a `tracefs` entry over a `debugfs` entry.
    ///
    /// If `/proc/mounts` cannot be opened the result is `-errno` from `fopen`;
    /// if no usable mount entry is found the result is `-ENOTSUP`.
    ///
    /// In any other build configuration no file is opened and the value 0 is
    /// used as the "new" state.
    fn update(&self) -> i32 {
        // Assigned exactly once by whichever cfg block below is compiled in.
        let new_file_or_error;

        #[cfg(not(all(target_os = "linux", feature = "user_events")))]
        {
            new_file_or_error = 0;
        }
        #[cfg(all(target_os = "linux", feature = "user_events"))]
        {
            // Need to find the ".../tracing/user_events_data" file in tracefs or debugfs.

            // First, try the usual tracefs mount point.
            if let new_file @ 0.. = open_wronly(b"/sys/kernel/tracing/user_events_data\0") {
                new_file_or_error = new_file;
            } else {
                // Determine tracefs/debugfs mount point by parsing "/proc/mounts".
                // Prefer "tracefs" over "debugfs": if we find a debugfs, save the path but
                // keep looking in case we find a tracefs later.
                clear_errno();
                let mounts_file = unsafe {
                    linux::fopen(
                        "/proc/mounts\0".as_ptr().cast::<ffi::c_char>(),
                        "r\0".as_ptr().cast::<ffi::c_char>(),
                    )
                };
                if mounts_file.is_null() {
                    new_file_or_error = -get_failure_errno();
                } else {
                    // `path[0] == 0` doubles as the "no candidate found yet" marker.
                    let mut path = [0u8; 274]; // 256 + sizeof("/user_events_data")
                    let mut line = [0u8; 4097];
                    loop {
                        // fgets is given the full buffer length; the scans below stop at
                        // the NUL it is expected to write after the data it read.
                        let fgets_result = unsafe {
                            linux::fgets(
                                line.as_mut_ptr().cast::<ffi::c_char>(),
                                line.len() as ffi::c_int,
                                mounts_file,
                            )
                        };
                        if fgets_result.is_null() {
                            // End of file or read error: stop scanning.
                            break;
                        }

                        // line is "device_name mount_point file_system other_stuff..."

                        let mut line_pos = 0;

                        // device_name
                        while Self::is_nonspace_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        // whitespace
                        while Self::is_space_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        // mount_point
                        let mount_begin = line_pos;
                        while Self::is_nonspace_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        let mount_end = line_pos;

                        // whitespace
                        while Self::is_space_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        // file_system
                        let fs_begin = line_pos;
                        while Self::is_nonspace_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        let fs_end = line_pos;

                        if !Self::is_space_char(line[line_pos]) {
                            // Ignore line if no whitespace after file_system.
                            continue;
                        }

                        let path_suffix: &[u8]; // Includes NUL
                        let fs = &line[fs_begin..fs_end];
                        let keep_looking;
                        if fs == b"tracefs" {
                            // "tracefsMountPoint/user_events_data"
                            path_suffix = b"/user_events_data\0";
                            keep_looking = false; // prefer "tracefs" over "debugfs"
                        } else if path[0] == 0 && fs == b"debugfs" {
                            // "debugfsMountPoint/tracing/user_events_data"
                            // Only the first debugfs entry is recorded (path must still be empty).
                            path_suffix = b"/tracing/user_events_data\0";
                            keep_looking = true; // prefer "tracefs" over "debugfs"
                        } else {
                            continue;
                        }

                        let mount_len = mount_end - mount_begin;
                        let path_len = mount_len + path_suffix.len(); // Includes NUL
                        if path_len > path.len() {
                            // Mount point too long for the buffer: skip this entry and keep
                            // whatever candidate (if any) was recorded earlier.
                            continue;
                        }

                        // path = mountpoint + suffix
                        path[0..mount_len].copy_from_slice(&line[mount_begin..mount_end]);
                        path[mount_len..path_len].copy_from_slice(path_suffix); // Includes NUL

                        if !keep_looking {
                            break;
                        }
                    }

                    // Every exit from the loop above reaches this point, so the
                    // stream is always closed.
                    unsafe { linux::fclose(mounts_file) };

                    if path[0] == 0 {
                        // Neither a tracefs nor a debugfs entry produced a usable path.
                        new_file_or_error = -linux::ENOTSUP;
                    } else {
                        // path is now something like "/sys/kernel/tracing/user_events_data\0" or
                        // "/sys/kernel/debug/tracing/user_events_data\0".
                        clear_errno();
                        new_file_or_error = if let new_file @ 0.. = open_wronly(&path) {
                            new_file
                        } else {
                            -get_failure_errno()
                        };
                    }
                }
            }
        }

        // Publish the result. The first attempt expects the initial -EAGAIN state;
        // on contention the loop retries only when that can replace an error with a file.
        let mut old_file_or_error = Self::EAGAIN_ERROR;
        loop {
            match self.file_or_error.compare_exchange(
                old_file_or_error,
                new_file_or_error,
                Ordering::Relaxed,
                Ordering::Relaxed,
            ) {
                Ok(_) => {
                    // We updated FILE_OR_ERROR to new.
                    return new_file_or_error;
                }
                Err(current_file_or_error) => {
                    // Somebody else updated FILE_OR_ERROR to current.
                    if current_file_or_error >= 0 || new_file_or_error < 0 {
                        // prefer current.
                        // If we opened a descriptor that will not be stored, close it
                        // so it does not leak.
                        #[cfg(all(target_os = "linux", feature = "user_events"))]
                        if new_file_or_error >= 0 {
                            unsafe { linux::close(new_file_or_error) };
                        }
                        return current_file_or_error;
                    }

                    // current is an error, new is a file, try again.
                    old_file_or_error = current_file_or_error;
                }
            }
        }
    }
232
233    // Initial state is -EAGAIN.
234    pub const fn new() -> Self {
235        return Self {
236            file_or_error: AtomicI32::new(Self::EAGAIN_ERROR),
237        };
238    }
239
240    // If file is open, closes it. Sets state to -EAGAIN.
241    pub fn close(&self) {
242        let file_or_error = self
243            .file_or_error
244            .swap(Self::EAGAIN_ERROR, Ordering::Relaxed);
245        if file_or_error >= 0 {
246            #[cfg(all(target_os = "linux", feature = "user_events"))]
247            unsafe {
248                linux::close(file_or_error)
249            };
250        }
251    }
252
253    // Returns existing state. This will be non-negative user_events_data file
254    // descriptor or -errno if file is not currently open.
255    #[cfg(all(target_os = "linux", feature = "user_events"))]
256    pub fn peek(&self) -> i32 {
257        return self.file_or_error.load(Ordering::Relaxed);
258    }
259
260    // If we have not already tried to open the `user_events_data` file, try
261    // to open it, atomically update state, and return the new state. Otherwise,
262    // return the existing state. Returns non-negative user_events_data file
263    // descriptor on success or -errno for error.
264    #[inline]
265    pub fn get(&self) -> i32 {
266        let file_or_error = self.file_or_error.load(Ordering::Relaxed);
267        return if file_or_error == Self::EAGAIN_ERROR {
268            self.update()
269        } else {
270            file_or_error
271        };
272    }
273}
274
275impl Drop for UserEventsDataFile {
276    fn drop(&mut self) {
277        self.close();
278    }
279}
280
/// Low-level API: the state backing a single tracepoint registration.
pub struct TracepointState {
    /// Enable/disable state, updated by the kernel: 0 while the tracepoint is
    /// disabled, nonzero while it is enabled.
    enable_status: AtomicU32,

    /// The kernel-assigned write index while registered; otherwise one of the
    /// sentinels `UNREGISTERED_WRITE_INDEX` or `BUSY_WRITE_INDEX`.
    write_index: AtomicU32,

    /// Marks the type as `!Unpin`.
    _pinned: marker::PhantomPinned,
}
293
294impl TracepointState {
    /// `write_index` sentinel meaning "not registered"; this is the value set by `new`.
    const UNREGISTERED_WRITE_INDEX: u32 = u32::MAX;
    /// `write_index` sentinel held while a register/unregister call is in progress.
    const BUSY_WRITE_INDEX: u32 = u32::MAX - 1;
    /// Largest `write_index` accepted as a real index by `write`; anything
    /// greater is one of the sentinels above.
    const HIGHEST_VALID_WRITE_INDEX: u32 = u32::MAX - 2;
298
    /// Direction value passed to `ioc` for "write" requests
    /// (presumably mirrors the kernel's `_IOC_WRITE`; confirm against the ioctl headers).
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const IOC_WRITE: ffi::c_ulong = 1;

    /// Direction value passed to `ioc` for "read" requests
    /// (presumably mirrors the kernel's `_IOC_READ`; confirm against the ioctl headers).
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const IOC_READ: ffi::c_ulong = 2;

    /// The ioctl "type" byte (`'*'`) shared by the requests below.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const DIAG_IOC_MAGIC: ffi::c_ulong = '*' as ffi::c_ulong;

    /// ioctl request used by `register_with_flags` (read+write direction, number 0).
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const DIAG_IOCSREG: ffi::c_ulong =
        Self::ioc(Self::IOC_WRITE | Self::IOC_READ, Self::DIAG_IOC_MAGIC, 0);

    /// ioctl request used by `unregister` (write direction, number 2).
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const DIAG_IOCSUNREG: ffi::c_ulong = Self::ioc(Self::IOC_WRITE, Self::DIAG_IOC_MAGIC, 2);
314
315    #[cfg(all(target_os = "linux", feature = "user_events"))]
316    const fn ioc(dir: ffi::c_ulong, typ: ffi::c_ulong, nr: ffi::c_ulong) -> ffi::c_ulong {
317        const IOC_NRBITS: u8 = 8;
318        const IOC_TYPEBITS: u8 = 8;
319        const IOC_SIZEBITS: u8 = 14;
320        const IOC_NRSHIFT: u8 = 0;
321        const IOC_TYPESHIFT: u8 = IOC_NRSHIFT + IOC_NRBITS;
322        const IOC_SIZESHIFT: u8 = IOC_TYPESHIFT + IOC_TYPEBITS;
323        const IOC_DIRSHIFT: u8 = IOC_SIZESHIFT + IOC_SIZEBITS;
324
325        return (dir << IOC_DIRSHIFT)
326            | (typ << IOC_TYPESHIFT)
327            | (nr << IOC_NRSHIFT)
328            | ((size_of::<usize>() as ffi::c_ulong) << IOC_SIZESHIFT);
329    }
330
331    /// Creates a new unregistered tracepoint.
332    ///
333    /// initial_enable_status is normally 0, since an unregistered tracepoint will
334    /// normally be considered disabled.
335    pub const fn new(initial_enable_status: u32) -> Self {
336        return Self {
337            enable_status: AtomicU32::new(initial_enable_status),
338            write_index: AtomicU32::new(Self::UNREGISTERED_WRITE_INDEX),
339            _pinned: marker::PhantomPinned,
340        };
341    }
342
343    /// Returns true if this tracepoint is enabled, i.e. `enable_status != 0`.
344    #[inline(always)]
345    pub fn enabled(&self) -> bool {
346        return 0 != self.enable_status.load(Ordering::Relaxed);
347    }
348
349    /// Unregisters this tracepoint.
350    ///
351    /// Returns 0 for success, error code (e.g. EBUSY, EALREADY) for error.
352    /// Error code is usually ignored in retail code, but may be helpful during
353    /// development to understand behavior or track down issues.
354    pub fn unregister(&self) -> i32 {
355        let error;
356
357        let old_write_index = self
358            .write_index
359            .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
360        match old_write_index {
361            Self::BUSY_WRITE_INDEX => {
362                error = 16; // EBUSY: Another thread is registering/unregistering. Do nothing.
363                return error; // Return immediately, need to leave write_index = BUSY.
364            }
365            Self::UNREGISTERED_WRITE_INDEX => {
366                error = 116; // EALREADY: Already unregistered. No action needed.
367            }
368            _ => {
369                #[cfg(not(all(target_os = "linux", feature = "user_events")))]
370                {
371                    error = 0;
372                }
373
374                #[cfg(all(target_os = "linux", feature = "user_events"))]
375                {
376                    #[repr(C, packed)]
377                    #[allow(non_camel_case_types)]
378                    struct user_unreg {
379                        size: u32,
380                        disable_bit: u8,
381                        reserved1: u8,
382                        reserved2: u16,
383                        disable_addr: u64,
384                    }
385
386                    let unreg = user_unreg {
387                        size: size_of::<user_unreg>() as u32,
388                        disable_bit: 0,
389                        reserved1: 0,
390                        reserved2: 0,
391                        disable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
392                    };
393
394                    clear_errno();
395                    let ioctl_result = unsafe {
396                        linux::ioctl(USER_EVENTS_DATA_FILE.peek(), Self::DIAG_IOCSUNREG, &unreg)
397                    };
398                    if 0 > ioctl_result {
399                        error = get_failure_errno();
400                    } else {
401                        error = 0;
402                    }
403                }
404            }
405        }
406
407        let old_write_index = self
408            .write_index
409            .swap(Self::UNREGISTERED_WRITE_INDEX, Ordering::Relaxed);
410        debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
411
412        return error;
413    }
414
415    /// Registers this tracepoint.
416    ///
417    /// Requires: this `TracepointState` is not currently registered.
418    ///
419    /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
420    /// is usually ignored in retail scenarios but may be helpful during development to
421    /// understand behavior or track down issues.
422    ///
423    /// `_name_args` is the tracepoint definition in the format
424    /// `Name[ FieldDef1[; FieldDef2...]]`. For example:
425    ///
426    /// - `MyTracepoint1`
427    /// - `MyTracepoint2 u32 Field1`
428    /// - `MyTracepoint3 u32 Field1; char Field2[20]`
429    ///
430    /// # Safety
431    ///
432    /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
433    /// will unregister itself when dropped, so this is only an issue if the tracepoint
434    /// is not dropped before it is deallocated. This might happen for a static variable
435    /// in a shared library that gets unloaded.
436    pub unsafe fn register(self: Pin<&Self>, _name_args: &ffi::CStr) -> i32 {
437        return self.register_with_flags(_name_args, 0);
438    }
439
440    /// Advanced: Registers this tracepoint using the specified `user_reg` flags.
441    ///
442    /// Requires: this `TracepointState` is not currently registered.
443    ///
444    /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
445    /// is usually ignored in retail scenarios but may be helpful during development to
446    /// understand behavior or track down issues.
447    ///
448    /// `_name_args` is the tracepoint definition in the format
449    /// `Name[ FieldDef1[; FieldDef2...]]`. For example:
450    ///
451    /// - `MyTracepoint1`
452    /// - `MyTracepoint2 u32 Field1`
453    /// - `MyTracepoint3 u32 Field1; char Field2[20]`
454    ///
455    /// `_flags` is normally `0`, but may also be set to a `user_reg` flag such as
456    /// `USER_EVENT_REG_PERSIST`.
457    ///
458    /// # Safety
459    ///
460    /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
461    /// will unregister itself when dropped, so this is only an issue if the tracepoint
462    /// is not dropped before it is deallocated. This might happen for a static variable
463    /// in a shared library that gets unloaded.
464    pub unsafe fn register_with_flags(
465        self: Pin<&Self>,
466        _name_args: &ffi::CStr,
467        _flags: u16,
468    ) -> i32 {
469        let error;
470        let new_write_index;
471
472        let old_write_index = self
473            .write_index
474            .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
475        assert!(
476            old_write_index == Self::UNREGISTERED_WRITE_INDEX,
477            "register of active tracepoint (already-registered or being-unregistered)"
478        );
479
480        let user_events_data = USER_EVENTS_DATA_FILE.get();
481        if user_events_data < 0 {
482            error = -user_events_data;
483            new_write_index = Self::UNREGISTERED_WRITE_INDEX;
484        } else {
485            #[cfg(not(all(target_os = "linux", feature = "user_events")))]
486            {
487                error = 0;
488                new_write_index = 0;
489            }
490
491            #[cfg(all(target_os = "linux", feature = "user_events"))]
492            {
493                #[repr(C, packed)]
494                #[allow(non_camel_case_types)]
495                struct user_reg {
496                    size: u32,
497                    enable_bit: u8,
498                    enable_size: u8,
499                    flags: u16,
500                    enable_addr: u64,
501                    name_args: u64,
502                    write_index: u32,
503                }
504
505                let mut reg = user_reg {
506                    size: size_of::<user_reg>() as u32,
507                    enable_bit: 0,
508                    enable_size: 4,
509                    flags: _flags,
510                    enable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
511                    name_args: _name_args.as_ptr() as usize as u64,
512                    write_index: 0,
513                };
514
515                clear_errno();
516                let ioctl_result =
517                    unsafe { linux::ioctl(user_events_data, Self::DIAG_IOCSREG, &mut reg) };
518                if 0 > ioctl_result {
519                    error = get_failure_errno();
520                    new_write_index = Self::UNREGISTERED_WRITE_INDEX;
521                } else {
522                    error = 0;
523                    new_write_index = reg.write_index;
524                    debug_assert!(new_write_index <= Self::HIGHEST_VALID_WRITE_INDEX);
525                }
526            }
527        }
528
529        let old_write_index = self.write_index.swap(new_write_index, Ordering::Relaxed);
530        debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
531
532        return error;
533    }
534
535    /// Generates an event.
536    ///
537    /// Requires: `data[0].is_empty()` since it will be used for the event headers.
538    ///
539    /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
540    /// is usually ignored in retail scenarios but may be helpful during development to
541    /// understand behavior or track down issues.
542    ///
543    /// If disabled or unregistered, this method does nothing and returnes EBADF.
544    /// Otherwise, sets `data[0] = write_index` then sends `data[..]` to the
545    /// `user_events_data` file handle.
546    ///
547    /// The event's payload is the concatenation of the remaining data blocks, if any
548    /// (i.e. `data[1..]`).
549    ///
550    /// The payload's layout should match the args specified in the call to `register`.
551    pub fn write(&self, data: &mut [EventDataDescriptor]) -> i32 {
552        debug_assert!(data[0].is_empty());
553
554        let enable_status = self.enable_status.load(Ordering::Relaxed);
555        let write_index = self.write_index.load(Ordering::Relaxed);
556        if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
557            return 9; // linux::EBADF
558        }
559
560        let writev_result = self.writev(data, &write_index.to_ne_bytes());
561        return writev_result;
562    }
563
564    /// Generates an event with headers.
565    ///
566    /// Requires: `data[0].is_empty()` since it will be used for the event headers;
567    /// `headers.len() >= 4` since it will be used for `write_index`.
568    ///
569    /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
570    /// is usually ignored in retail scenarios but may be helpful during development to
571    /// understand behavior or track down issues.
572    ///
573    /// If disabled or unregistered, this method does nothing and returnes EBADF.
574    /// Otherwise, sets `data[0] = headers` and `headers[0..4] = write_index`, then sends
575    /// `data[..]` to the `user_events_data` file.
576    ///
577    /// The event's payload is the concatenation of the remaining data blocks, if any
578    /// (i.e. `data[1..]`).
579    ///
580    /// The payload's layout should match the args specified in the call to `register`.
581    pub fn write_with_headers(&self, data: &mut [EventDataDescriptor], headers: &mut [u8]) -> i32 {
582        debug_assert!(data[0].is_empty());
583        debug_assert!(headers.len() >= 4);
584
585        let enable_status = self.enable_status.load(Ordering::Relaxed);
586        let write_index = self.write_index.load(Ordering::Relaxed);
587        if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
588            return 9; // linux::EBADF
589        }
590
591        *<&mut [u8; 4]>::try_from(&mut headers[0..4]).unwrap() = write_index.to_ne_bytes();
592
593        let writev_result = self.writev(data, headers);
594        return writev_result;
595    }
596
597    // Returns 0 for success, errno for error.
598    fn writev(&self, _data: &mut [EventDataDescriptor], _headers: &[u8]) -> i32 {
599        #[cfg(all(target_os = "linux", feature = "user_events"))]
600        unsafe {
601            // Unsafe: Putting headers into a container a with longer lifetime.
602            _data[0] =
603                EventDataDescriptor::from_raw_ptr(_headers.as_ptr() as usize, _headers.len());
604
605            let writev_result = linux::writev(
606                USER_EVENTS_DATA_FILE.peek(),
607                _data.as_ptr() as *const linux::iovec,
608                _data.len() as i32,
609            );
610
611            // Clear the container before headers lifetime ends.
612            _data[0] = EventDataDescriptor::zero();
613
614            if 0 > writev_result {
615                return get_failure_errno();
616            }
617        }
618
619        return 0;
620    }
621}
622
623impl Drop for TracepointState {
624    fn drop(&mut self) {
625        self.unregister();
626    }
627}
628
/// Possible configurations under which this crate can be compiled: `LinuxUserEvents` or
/// `Other`.
///
/// Implements `Debug`, `Clone`, `Copy`, `PartialEq` and `Eq` so that callers can
/// compare and print the value of `NATIVE_IMPLEMENTATION`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NativeImplementation {
    /// Crate compiled for other configuration (no logging is performed).
    Other,

    /// Crate compiled for Linux user_events configuration (logging is performed via
    /// `user_events_data` file).
    LinuxUserEvents,
}
639
640/// The configuration under which this crate was compiled: `LinuxUserEvents` or `Other`.
641pub const NATIVE_IMPLEMENTATION: NativeImplementation =
642    if cfg!(all(target_os = "linux", feature = "user_events")) {
643        NativeImplementation::LinuxUserEvents
644    } else {
645        NativeImplementation::Other
646    };