tracepoint/native.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT license.
3
4use core::ffi;
5use core::marker;
6use core::pin::Pin;
7use core::sync::atomic::AtomicI32;
8use core::sync::atomic::AtomicU32;
9use core::sync::atomic::Ordering;
10
11use crate::descriptors::EventDataDescriptor;
12
13#[cfg(all(target_os = "linux", feature = "user_events"))]
14use core::mem::size_of;
15
16#[cfg(all(target_os = "linux", feature = "user_events"))]
17use libc as linux;
18
// Shared handle to the `user_events_data` file used by the `TracepointState`
// methods in this file (register, unregister, writev). It starts in the
// "not yet opened" state produced by `UserEventsDataFile::new()`.
//
// Note: this is intentionally leaked. Rust does not run `Drop` for a `static`,
// so a descriptor stored here is never closed by `UserEventsDataFile::drop`.
static USER_EVENTS_DATA_FILE: UserEventsDataFile = UserEventsDataFile::new();
21
/// Reads `errno` after a failed errno-setting operation.
///
/// Requires: an errno-setting operation has just failed.
///
/// Returns the current value of `linux::errno`. In debug builds, asserts that
/// the value is positive, since calling this without a preceding failure is a
/// logic error.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn get_failure_errno() -> i32 {
    // SAFETY: relies on `__errno_location()` returning a valid, readable
    // pointer to the current `errno` value.
    let code = unsafe {
        let errno_ptr = linux::__errno_location();
        *errno_ptr
    };
    debug_assert!(code > 0);
    code
}
32
/// Resets `linux::errno` to 0 so that a later failure can be told apart from a
/// stale error value.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn clear_errno() {
    // SAFETY: relies on `__errno_location()` returning a valid, writable
    // pointer to the current `errno` value.
    unsafe {
        let errno_ptr = linux::__errno_location();
        *errno_ptr = 0;
    }
}
38
/// Calls `linux::open(path0, O_WRONLY)` and returns its result unchanged.
///
/// `path0` must be a NUL-terminated byte string.
///
/// # Panics
///
/// Panics if the last byte of `path0` is not NUL.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn open_wronly(path0: &[u8]) -> ffi::c_int {
    assert!(path0.ends_with(&[0]));
    let c_path = path0.as_ptr().cast::<ffi::c_char>();
    // SAFETY: the assert above guarantees a NUL terminator inside `path0`, so
    // `open` reads only within the slice.
    unsafe { linux::open(c_path, linux::O_WRONLY) }
}
45
/// Holds either an open file descriptor for the `user_events_data` file or the
/// reason it is not open, packed into a single atomic integer.
struct UserEventsDataFile {
    /// Initial value is -EAGAIN.
    /// Negative value is -errno with the error code from failed open.
    /// Non-negative value is file descriptor for the "user_events_data" file.
    file_or_error: AtomicI32,
}
52
53impl UserEventsDataFile {
/// -EAGAIN: the value `file_or_error` holds before any open attempt has been
/// recorded, and again after `close()`.
const EAGAIN_ERROR: i32 = -11;
55
/// Returns true if `ch` is a field separator: a space or a tab.
#[cfg(all(target_os = "linux", feature = "user_events"))]
const fn is_space_char(ch: u8) -> bool {
    matches!(ch, b' ' | b'\t')
}
60
/// Returns true if `ch` is part of a field: anything other than NUL, space,
/// or tab. NUL is excluded so that scans stop at the end of a C string.
#[cfg(all(target_os = "linux", feature = "user_events"))]
const fn is_nonspace_char(ch: u8) -> bool {
    !matches!(ch, b'\0' | b' ' | b'\t')
}
65
66 /// Opens a file descriptor to the `user_events_data` file.
67 /// Atomically updates `self.file_or_error` to either a negative
68 /// value (-errno returned from `linux::open`) or a non-negative value
69 /// (the file descriptor). If `self.file_or_error` already contains a
70 /// non-negative value, the existing value is retained and the new
71 /// descriptor is closed. In all cases, returns the final value of
72 /// `self.file_or_error`.
73 fn update(&self) -> i32 {
74 let new_file_or_error;
75
76 #[cfg(not(all(target_os = "linux", feature = "user_events")))]
77 {
78 new_file_or_error = 0;
79 }
80 #[cfg(all(target_os = "linux", feature = "user_events"))]
81 {
82 // Need to find the ".../tracing/user_events_data" file in tracefs or debugfs.
83
84 // First, try the usual tracefs mount point.
85 if let new_file @ 0.. = open_wronly(b"/sys/kernel/tracing/user_events_data\0") {
86 new_file_or_error = new_file;
87 } else {
88 // Determine tracefs/debugfs mount point by parsing "/proc/mounts".
89 // Prefer "tracefs" over "debugfs": if we find a debugfs, save the path but
90 // keep looking in case we find a tracefs later.
91 clear_errno();
92 let mounts_file = unsafe {
93 linux::fopen(
94 "/proc/mounts\0".as_ptr().cast::<ffi::c_char>(),
95 "r\0".as_ptr().cast::<ffi::c_char>(),
96 )
97 };
98 if mounts_file.is_null() {
99 new_file_or_error = -get_failure_errno();
100 } else {
101 let mut path = [0u8; 274]; // 256 + sizeof("/user_events_data")
102 let mut line = [0u8; 4097];
103 loop {
104 let fgets_result = unsafe {
105 linux::fgets(
106 line.as_mut_ptr().cast::<ffi::c_char>(),
107 line.len() as ffi::c_int,
108 mounts_file,
109 )
110 };
111 if fgets_result.is_null() {
112 break;
113 }
114
115 // line is "device_name mount_point file_system other_stuff..."
116
117 let mut line_pos = 0;
118
119 // device_name
120 while Self::is_nonspace_char(line[line_pos]) {
121 line_pos += 1;
122 }
123
124 // whitespace
125 while Self::is_space_char(line[line_pos]) {
126 line_pos += 1;
127 }
128
129 // mount_point
130 let mount_begin = line_pos;
131 while Self::is_nonspace_char(line[line_pos]) {
132 line_pos += 1;
133 }
134
135 let mount_end = line_pos;
136
137 // whitespace
138 while Self::is_space_char(line[line_pos]) {
139 line_pos += 1;
140 }
141
142 // file_system
143 let fs_begin = line_pos;
144 while Self::is_nonspace_char(line[line_pos]) {
145 line_pos += 1;
146 }
147
148 let fs_end = line_pos;
149
150 if !Self::is_space_char(line[line_pos]) {
151 // Ignore line if no whitespace after file_system.
152 continue;
153 }
154
155 let path_suffix: &[u8]; // Includes NUL
156 let fs = &line[fs_begin..fs_end];
157 let keep_looking;
158 if fs == b"tracefs" {
159 // "tracefsMountPoint/user_events_data"
160 path_suffix = b"/user_events_data\0";
161 keep_looking = false; // prefer "tracefs" over "debugfs"
162 } else if path[0] == 0 && fs == b"debugfs" {
163 // "debugfsMountPoint/tracing/user_events_data"
164 path_suffix = b"/tracing/user_events_data\0";
165 keep_looking = true; // prefer "tracefs" over "debugfs"
166 } else {
167 continue;
168 }
169
170 let mount_len = mount_end - mount_begin;
171 let path_len = mount_len + path_suffix.len(); // Includes NUL
172 if path_len > path.len() {
173 continue;
174 }
175
176 // path = mountpoint + suffix
177 path[0..mount_len].copy_from_slice(&line[mount_begin..mount_end]);
178 path[mount_len..path_len].copy_from_slice(path_suffix); // Includes NUL
179
180 if !keep_looking {
181 break;
182 }
183 }
184
185 unsafe { linux::fclose(mounts_file) };
186
187 if path[0] == 0 {
188 new_file_or_error = -linux::ENOTSUP;
189 } else {
190 // path is now something like "/sys/kernel/tracing/user_events_data\0" or
191 // "/sys/kernel/debug/tracing/user_events_data\0".
192 clear_errno();
193 new_file_or_error = if let new_file @ 0.. = open_wronly(&path) {
194 new_file
195 } else {
196 -get_failure_errno()
197 };
198 }
199 }
200 }
201 }
202
203 let mut old_file_or_error = Self::EAGAIN_ERROR;
204 loop {
205 match self.file_or_error.compare_exchange(
206 old_file_or_error,
207 new_file_or_error,
208 Ordering::Relaxed,
209 Ordering::Relaxed,
210 ) {
211 Ok(_) => {
212 // We updated FILE_OR_ERROR to new.
213 return new_file_or_error;
214 }
215 Err(current_file_or_error) => {
216 // Somebody else updated FILE_OR_ERROR to current.
217 if current_file_or_error >= 0 || new_file_or_error < 0 {
218 // prefer current.
219 #[cfg(all(target_os = "linux", feature = "user_events"))]
220 if new_file_or_error >= 0 {
221 unsafe { linux::close(new_file_or_error) };
222 }
223 return current_file_or_error;
224 }
225
226 // current is an error, new is a file, try again.
227 old_file_or_error = current_file_or_error;
228 }
229 }
230 }
231 }
232
233 // Initial state is -EAGAIN.
234 pub const fn new() -> Self {
235 return Self {
236 file_or_error: AtomicI32::new(Self::EAGAIN_ERROR),
237 };
238 }
239
240 // If file is open, closes it. Sets state to -EAGAIN.
241 pub fn close(&self) {
242 let file_or_error = self
243 .file_or_error
244 .swap(Self::EAGAIN_ERROR, Ordering::Relaxed);
245 if file_or_error >= 0 {
246 #[cfg(all(target_os = "linux", feature = "user_events"))]
247 unsafe {
248 linux::close(file_or_error)
249 };
250 }
251 }
252
/// Returns the current state without attempting to open the file: a
/// non-negative `user_events_data` file descriptor, or -errno if the file is
/// not currently open.
#[cfg(all(target_os = "linux", feature = "user_events"))]
pub fn peek(&self) -> i32 {
    self.file_or_error.load(Ordering::Relaxed)
}
259
260 // If we have not already tried to open the `user_events_data` file, try
261 // to open it, atomically update state, and return the new state. Otherwise,
262 // return the existing state. Returns non-negative user_events_data file
263 // descriptor on success or -errno for error.
264 #[inline]
265 pub fn get(&self) -> i32 {
266 let file_or_error = self.file_or_error.load(Ordering::Relaxed);
267 return if file_or_error == Self::EAGAIN_ERROR {
268 self.update()
269 } else {
270 file_or_error
271 };
272 }
273}
274
275impl Drop for UserEventsDataFile {
276 fn drop(&mut self) {
277 self.close();
278 }
279}
280
/// Low-level API: Represents a tracepoint registration.
///
/// Registration hands the address of `enable_status` to the kernel, which is
/// why the registration methods take `Pin<&Self>` and the type carries a
/// `PhantomPinned` marker.
pub struct TracepointState {
    /// The kernel will update this variable with tracepoint enable/disable state.
    /// It will be 0 if tracepoint is disabled, nonzero if tracepoint is enabled.
    enable_status: AtomicU32,

    /// This will be a kernel-assigned value if registered,
    /// `UNREGISTERED_WRITE_INDEX` or `BUSY_WRITE_INDEX` if not registered.
    write_index: AtomicU32,

    /// Opts the type out of `Unpin`.
    _pinned: marker::PhantomPinned,
}
293
294impl TracepointState {
/// `write_index` value meaning "not registered".
const UNREGISTERED_WRITE_INDEX: u32 = u32::MAX;
/// `write_index` value meaning a register/unregister call is in progress.
const BUSY_WRITE_INDEX: u32 = u32::MAX - 1;
/// Largest `write_index` value that is treated as a real, kernel-assigned index.
const HIGHEST_VALID_WRITE_INDEX: u32 = u32::MAX - 2;

/// ioctl direction flag: write.
#[cfg(all(target_os = "linux", feature = "user_events"))]
const IOC_WRITE: ffi::c_ulong = 1;

/// ioctl direction flag: read.
#[cfg(all(target_os = "linux", feature = "user_events"))]
const IOC_READ: ffi::c_ulong = 2;

/// ioctl type ("magic") value used by the requests below.
#[cfg(all(target_os = "linux", feature = "user_events"))]
const DIAG_IOC_MAGIC: ffi::c_ulong = '*' as ffi::c_ulong;

/// ioctl request used by `register_with_flags` (read/write, number 0).
#[cfg(all(target_os = "linux", feature = "user_events"))]
const DIAG_IOCSREG: ffi::c_ulong =
    Self::ioc(Self::IOC_WRITE | Self::IOC_READ, Self::DIAG_IOC_MAGIC, 0);

/// ioctl request used by `unregister` (write, number 2).
#[cfg(all(target_os = "linux", feature = "user_events"))]
const DIAG_IOCSUNREG: ffi::c_ulong = Self::ioc(Self::IOC_WRITE, Self::DIAG_IOC_MAGIC, 2);
314
/// Builds an ioctl request number from a direction, a type ("magic") and a
/// command number.
///
/// Bit layout, low to high: 8 bits of `nr`, 8 bits of `typ`, 14 bits of
/// argument size, then `dir`. The size field is always `size_of::<usize>()`.
#[cfg(all(target_os = "linux", feature = "user_events"))]
const fn ioc(dir: ffi::c_ulong, typ: ffi::c_ulong, nr: ffi::c_ulong) -> ffi::c_ulong {
    const NR_WIDTH: u32 = 8;
    const TYPE_WIDTH: u32 = 8;
    const SIZE_WIDTH: u32 = 14;

    const NR_SHIFT: u32 = 0;
    const TYPE_SHIFT: u32 = NR_SHIFT + NR_WIDTH;
    const SIZE_SHIFT: u32 = TYPE_SHIFT + TYPE_WIDTH;
    const DIR_SHIFT: u32 = SIZE_SHIFT + SIZE_WIDTH;

    let arg_size = size_of::<usize>() as ffi::c_ulong;

    (nr << NR_SHIFT) | (typ << TYPE_SHIFT) | (arg_size << SIZE_SHIFT) | (dir << DIR_SHIFT)
}
330
331 /// Creates a new unregistered tracepoint.
332 ///
333 /// initial_enable_status is normally 0, since an unregistered tracepoint will
334 /// normally be considered disabled.
335 pub const fn new(initial_enable_status: u32) -> Self {
336 return Self {
337 enable_status: AtomicU32::new(initial_enable_status),
338 write_index: AtomicU32::new(Self::UNREGISTERED_WRITE_INDEX),
339 _pinned: marker::PhantomPinned,
340 };
341 }
342
343 /// Returns true if this tracepoint is enabled, i.e. `enable_status != 0`.
344 #[inline(always)]
345 pub fn enabled(&self) -> bool {
346 return 0 != self.enable_status.load(Ordering::Relaxed);
347 }
348
349 /// Unregisters this tracepoint.
350 ///
351 /// Returns 0 for success, error code (e.g. EBUSY, EALREADY) for error.
352 /// Error code is usually ignored in retail code, but may be helpful during
353 /// development to understand behavior or track down issues.
354 pub fn unregister(&self) -> i32 {
355 let error;
356
357 let old_write_index = self
358 .write_index
359 .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
360 match old_write_index {
361 Self::BUSY_WRITE_INDEX => {
362 error = 16; // EBUSY: Another thread is registering/unregistering. Do nothing.
363 return error; // Return immediately, need to leave write_index = BUSY.
364 }
365 Self::UNREGISTERED_WRITE_INDEX => {
366 error = 116; // EALREADY: Already unregistered. No action needed.
367 }
368 _ => {
369 #[cfg(not(all(target_os = "linux", feature = "user_events")))]
370 {
371 error = 0;
372 }
373
374 #[cfg(all(target_os = "linux", feature = "user_events"))]
375 {
376 #[repr(C, packed)]
377 #[allow(non_camel_case_types)]
378 struct user_unreg {
379 size: u32,
380 disable_bit: u8,
381 reserved1: u8,
382 reserved2: u16,
383 disable_addr: u64,
384 }
385
386 let unreg = user_unreg {
387 size: size_of::<user_unreg>() as u32,
388 disable_bit: 0,
389 reserved1: 0,
390 reserved2: 0,
391 disable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
392 };
393
394 clear_errno();
395 let ioctl_result = unsafe {
396 linux::ioctl(USER_EVENTS_DATA_FILE.peek(), Self::DIAG_IOCSUNREG, &unreg)
397 };
398 if 0 > ioctl_result {
399 error = get_failure_errno();
400 } else {
401 error = 0;
402 }
403 }
404 }
405 }
406
407 let old_write_index = self
408 .write_index
409 .swap(Self::UNREGISTERED_WRITE_INDEX, Ordering::Relaxed);
410 debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
411
412 return error;
413 }
414
415 /// Registers this tracepoint.
416 ///
417 /// Requires: this `TracepointState` is not currently registered.
418 ///
419 /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
420 /// is usually ignored in retail scenarios but may be helpful during development to
421 /// understand behavior or track down issues.
422 ///
423 /// `_name_args` is the tracepoint definition in the format
424 /// `Name[ FieldDef1[; FieldDef2...]]`. For example:
425 ///
426 /// - `MyTracepoint1`
427 /// - `MyTracepoint2 u32 Field1`
428 /// - `MyTracepoint3 u32 Field1; char Field2[20]`
429 ///
430 /// # Safety
431 ///
432 /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
433 /// will unregister itself when dropped, so this is only an issue if the tracepoint
434 /// is not dropped before it is deallocated. This might happen for a static variable
435 /// in a shared library that gets unloaded.
436 pub unsafe fn register(self: Pin<&Self>, _name_args: &ffi::CStr) -> i32 {
437 return self.register_with_flags(_name_args, 0);
438 }
439
440 /// Advanced: Registers this tracepoint using the specified `user_reg` flags.
441 ///
442 /// Requires: this `TracepointState` is not currently registered.
443 ///
444 /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
445 /// is usually ignored in retail scenarios but may be helpful during development to
446 /// understand behavior or track down issues.
447 ///
448 /// `_name_args` is the tracepoint definition in the format
449 /// `Name[ FieldDef1[; FieldDef2...]]`. For example:
450 ///
451 /// - `MyTracepoint1`
452 /// - `MyTracepoint2 u32 Field1`
453 /// - `MyTracepoint3 u32 Field1; char Field2[20]`
454 ///
455 /// `_flags` is normally `0`, but may also be set to a `user_reg` flag such as
456 /// `USER_EVENT_REG_PERSIST`.
457 ///
458 /// # Safety
459 ///
460 /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
461 /// will unregister itself when dropped, so this is only an issue if the tracepoint
462 /// is not dropped before it is deallocated. This might happen for a static variable
463 /// in a shared library that gets unloaded.
464 pub unsafe fn register_with_flags(
465 self: Pin<&Self>,
466 _name_args: &ffi::CStr,
467 _flags: u16,
468 ) -> i32 {
469 let error;
470 let new_write_index;
471
472 let old_write_index = self
473 .write_index
474 .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
475 assert!(
476 old_write_index == Self::UNREGISTERED_WRITE_INDEX,
477 "register of active tracepoint (already-registered or being-unregistered)"
478 );
479
480 let user_events_data = USER_EVENTS_DATA_FILE.get();
481 if user_events_data < 0 {
482 error = -user_events_data;
483 new_write_index = Self::UNREGISTERED_WRITE_INDEX;
484 } else {
485 #[cfg(not(all(target_os = "linux", feature = "user_events")))]
486 {
487 error = 0;
488 new_write_index = 0;
489 }
490
491 #[cfg(all(target_os = "linux", feature = "user_events"))]
492 {
493 #[repr(C, packed)]
494 #[allow(non_camel_case_types)]
495 struct user_reg {
496 size: u32,
497 enable_bit: u8,
498 enable_size: u8,
499 flags: u16,
500 enable_addr: u64,
501 name_args: u64,
502 write_index: u32,
503 }
504
505 let mut reg = user_reg {
506 size: size_of::<user_reg>() as u32,
507 enable_bit: 0,
508 enable_size: 4,
509 flags: _flags,
510 enable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
511 name_args: _name_args.as_ptr() as usize as u64,
512 write_index: 0,
513 };
514
515 clear_errno();
516 let ioctl_result =
517 unsafe { linux::ioctl(user_events_data, Self::DIAG_IOCSREG, &mut reg) };
518 if 0 > ioctl_result {
519 error = get_failure_errno();
520 new_write_index = Self::UNREGISTERED_WRITE_INDEX;
521 } else {
522 error = 0;
523 new_write_index = reg.write_index;
524 debug_assert!(new_write_index <= Self::HIGHEST_VALID_WRITE_INDEX);
525 }
526 }
527 }
528
529 let old_write_index = self.write_index.swap(new_write_index, Ordering::Relaxed);
530 debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
531
532 return error;
533 }
534
535 /// Generates an event.
536 ///
537 /// Requires: `data[0].is_empty()` since it will be used for the event headers.
538 ///
539 /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
540 /// is usually ignored in retail scenarios but may be helpful during development to
541 /// understand behavior or track down issues.
542 ///
543 /// If disabled or unregistered, this method does nothing and returnes EBADF.
544 /// Otherwise, sets `data[0] = write_index` then sends `data[..]` to the
545 /// `user_events_data` file handle.
546 ///
547 /// The event's payload is the concatenation of the remaining data blocks, if any
548 /// (i.e. `data[1..]`).
549 ///
550 /// The payload's layout should match the args specified in the call to `register`.
551 pub fn write(&self, data: &mut [EventDataDescriptor]) -> i32 {
552 debug_assert!(data[0].is_empty());
553
554 let enable_status = self.enable_status.load(Ordering::Relaxed);
555 let write_index = self.write_index.load(Ordering::Relaxed);
556 if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
557 return 9; // linux::EBADF
558 }
559
560 let writev_result = self.writev(data, &write_index.to_ne_bytes());
561 return writev_result;
562 }
563
564 /// Generates an event with headers.
565 ///
566 /// Requires: `data[0].is_empty()` since it will be used for the event headers;
567 /// `headers.len() >= 4` since it will be used for `write_index`.
568 ///
569 /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
570 /// is usually ignored in retail scenarios but may be helpful during development to
571 /// understand behavior or track down issues.
572 ///
573 /// If disabled or unregistered, this method does nothing and returnes EBADF.
574 /// Otherwise, sets `data[0] = headers` and `headers[0..4] = write_index`, then sends
575 /// `data[..]` to the `user_events_data` file.
576 ///
577 /// The event's payload is the concatenation of the remaining data blocks, if any
578 /// (i.e. `data[1..]`).
579 ///
580 /// The payload's layout should match the args specified in the call to `register`.
581 pub fn write_with_headers(&self, data: &mut [EventDataDescriptor], headers: &mut [u8]) -> i32 {
582 debug_assert!(data[0].is_empty());
583 debug_assert!(headers.len() >= 4);
584
585 let enable_status = self.enable_status.load(Ordering::Relaxed);
586 let write_index = self.write_index.load(Ordering::Relaxed);
587 if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
588 return 9; // linux::EBADF
589 }
590
591 *<&mut [u8; 4]>::try_from(&mut headers[0..4]).unwrap() = write_index.to_ne_bytes();
592
593 let writev_result = self.writev(data, headers);
594 return writev_result;
595 }
596
597 // Returns 0 for success, errno for error.
598 fn writev(&self, _data: &mut [EventDataDescriptor], _headers: &[u8]) -> i32 {
599 #[cfg(all(target_os = "linux", feature = "user_events"))]
600 unsafe {
601 // Unsafe: Putting headers into a container a with longer lifetime.
602 _data[0] =
603 EventDataDescriptor::from_raw_ptr(_headers.as_ptr() as usize, _headers.len());
604
605 let writev_result = linux::writev(
606 USER_EVENTS_DATA_FILE.peek(),
607 _data.as_ptr() as *const linux::iovec,
608 _data.len() as i32,
609 );
610
611 // Clear the container before headers lifetime ends.
612 _data[0] = EventDataDescriptor::zero();
613
614 if 0 > writev_result {
615 return get_failure_errno();
616 }
617 }
618
619 return 0;
620 }
621}
622
623impl Drop for TracepointState {
624 fn drop(&mut self) {
625 self.unregister();
626 }
627}
628
/// Possible configurations under which this crate can be compiled: `LinuxUserEvents` or
/// `Other`.
///
/// Implements `Debug`, `Clone`, `Copy`, `PartialEq`, `Eq` and `Hash` so that
/// values can be printed, compared with `==`, and used as keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NativeImplementation {
    /// Crate compiled for other configuration (no logging is performed).
    Other,

    /// Crate compiled for Linux user_events configuration (logging is performed via
    /// `user_events_data` file).
    LinuxUserEvents,
}
639
640/// The configuration under which this crate was compiled: `LinuxUserEvents` or `Other`.
641pub const NATIVE_IMPLEMENTATION: NativeImplementation =
642 if cfg!(all(target_os = "linux", feature = "user_events")) {
643 NativeImplementation::LinuxUserEvents
644 } else {
645 NativeImplementation::Other
646 };