tracepoint/native.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT license.
3
4use core::ffi;
5use core::marker;
6use core::mem::size_of;
7use core::pin::Pin;
8use core::sync::atomic::AtomicI32;
9use core::sync::atomic::AtomicU32;
10use core::sync::atomic::Ordering;
11
12use crate::descriptors::EventDataDescriptor;
13
14#[cfg(all(target_os = "linux", feature = "user_events"))]
15use libc as linux;
16
// Process-wide state for the `user_events_data` file, used by the
// `TracepointState` methods in this file. It starts in the state produced by
// `UserEventsDataFile::new()`.
// Note: this is intentionally leaked. A `static` is never dropped, so the
// `Drop` implementation of `UserEventsDataFile` does not run for it.
static USER_EVENTS_DATA_FILE: UserEventsDataFile = UserEventsDataFile::new();
19
/// Reads `linux::errno` after a failed errno-setting operation.
///
/// Precondition: an errno-setting operation has just failed.
///
/// Returns the current `errno` value. Debug builds assert that it is positive.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn get_failure_errno() -> i32 {
    // SAFETY: dereferences the pointer returned by `__errno_location()`.
    let errno: i32 = unsafe { *linux::__errno_location() };

    // A non-positive value means the caller did not observe a real failure.
    debug_assert!(errno > 0);
    errno
}
30
/// Resets `linux::errno` to 0.
///
/// Called before an errno-setting operation so that `get_failure_errno` reads
/// a value written by that operation.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn clear_errno() {
    // SAFETY: writes through the pointer returned by `__errno_location()`.
    unsafe {
        let errno_ptr = linux::__errno_location();
        *errno_ptr = 0;
    }
}
36
/// Thin wrapper for `linux::open(path0, O_WRONLY)`.
///
/// `path0` must end with a NUL byte; the function panics otherwise.
///
/// Returns whatever `linux::open` returns.
#[cfg(all(target_os = "linux", feature = "user_events"))]
fn open_wronly(path0: &[u8]) -> ffi::c_int {
    assert!(path0.ends_with(&[0]));

    let c_path = path0.as_ptr().cast::<ffi::c_char>();
    // SAFETY: `c_path` points to a NUL-terminated buffer (asserted above).
    unsafe { linux::open(c_path, linux::O_WRONLY) }
}
43
/// State of the `user_events_data` file: either an open file descriptor or
/// the negated errno explaining why no descriptor is available.
///
/// The whole state lives in one atomic integer, so every method takes `&self`.
struct UserEventsDataFile {
    /// Initial value is -EAGAIN.
    /// Negative value is -errno with the error code from failed open.
    /// Non-negative value is file descriptor for the "user_events_data" file.
    file_or_error: AtomicI32,
}
50
51impl UserEventsDataFile {
    /// -EAGAIN. This is the initial value of `file_or_error` (set by `new()`),
    /// the value restored by `close()`, and the value that makes `get()` call
    /// `update()`.
    const EAGAIN_ERROR: i32 = -11;
53
54 #[cfg(all(target_os = "linux", feature = "user_events"))]
55 const fn is_space_char(ch: u8) -> bool {
56 return ch == b' ' || ch == b'\t';
57 }
58
59 #[cfg(all(target_os = "linux", feature = "user_events"))]
60 const fn is_nonspace_char(ch: u8) -> bool {
61 return ch != b'\0' && !Self::is_space_char(ch);
62 }
63
    /// Opens a file descriptor to the `user_events_data` file.
    /// Atomically updates `self.file_or_error` to either a negative
    /// value (-errno returned from `linux::open`) or a non-negative value
    /// (the file descriptor). If `self.file_or_error` already contains a
    /// non-negative value, the existing value is retained and the new
    /// descriptor is closed. In all cases, returns the final value of
    /// `self.file_or_error`.
    ///
    /// Other negative results computed here: -errno from a failed `fopen` of
    /// "/proc/mounts", and `-ENOTSUP` when no usable tracefs/debugfs mount
    /// point is found in "/proc/mounts".
    ///
    /// When not compiled for Linux with the `user_events` feature, nothing is
    /// opened and the new value is simply 0.
    fn update(&self) -> i32 {
        // Assigned exactly once by whichever cfg-selected block below is compiled.
        let new_file_or_error;

        #[cfg(not(all(target_os = "linux", feature = "user_events")))]
        {
            new_file_or_error = 0;
        }
        #[cfg(all(target_os = "linux", feature = "user_events"))]
        {
            // Need to find the ".../tracing/user_events_data" file in tracefs or debugfs.

            // First, try the usual tracefs mount point.
            // The `0..` pattern matches any non-negative result, i.e. a valid descriptor.
            if let new_file @ 0.. = open_wronly(b"/sys/kernel/tracing/user_events_data\0") {
                new_file_or_error = new_file;
            } else {
                // Determine tracefs/debugfs mount point by parsing "/proc/mounts".
                // Prefer "tracefs" over "debugfs": if we find a debugfs, save the path but
                // keep looking in case we find a tracefs later.
                clear_errno();
                let mounts_file = unsafe {
                    linux::fopen(
                        "/proc/mounts\0".as_ptr().cast::<ffi::c_char>(),
                        "r\0".as_ptr().cast::<ffi::c_char>(),
                    )
                };
                if mounts_file.is_null() {
                    new_file_or_error = -get_failure_errno();
                } else {
                    // `path` stays all-zero until a candidate is stored, so
                    // `path[0] == 0` means "nothing found yet".
                    let mut path = [0u8; 274]; // 256 + sizeof("/user_events_data")
                    let mut line = [0u8; 4097];
                    loop {
                        // fgets NUL-terminates what it stores, and both char
                        // classifiers return false for NUL, so the scans below
                        // stop at or before the terminator.
                        let fgets_result = unsafe {
                            linux::fgets(
                                line.as_mut_ptr().cast::<ffi::c_char>(),
                                line.len() as ffi::c_int,
                                mounts_file,
                            )
                        };
                        if fgets_result.is_null() {
                            // End of file or read error: stop scanning.
                            break;
                        }

                        // line is "device_name mount_point file_system other_stuff..."

                        let mut line_pos = 0;

                        // device_name
                        while Self::is_nonspace_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        // whitespace
                        while Self::is_space_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        // mount_point
                        let mount_begin = line_pos;
                        while Self::is_nonspace_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        let mount_end = line_pos;

                        // whitespace
                        while Self::is_space_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        // file_system
                        let fs_begin = line_pos;
                        while Self::is_nonspace_char(line[line_pos]) {
                            line_pos += 1;
                        }

                        let fs_end = line_pos;

                        if !Self::is_space_char(line[line_pos]) {
                            // Ignore line if no whitespace after file_system.
                            continue;
                        }

                        let path_suffix: &[u8]; // Includes NUL
                        let fs = &line[fs_begin..fs_end];
                        let keep_looking;
                        if fs == b"tracefs" {
                            // "tracefsMountPoint/user_events_data"
                            path_suffix = b"/user_events_data\0";
                            keep_looking = false; // prefer "tracefs" over "debugfs"
                        } else if path[0] == 0 && fs == b"debugfs" {
                            // "debugfsMountPoint/tracing/user_events_data"
                            // Only the first debugfs entry is recorded (path[0] == 0 check).
                            path_suffix = b"/tracing/user_events_data\0";
                            keep_looking = true; // prefer "tracefs" over "debugfs"
                        } else {
                            continue;
                        }

                        let mount_len = mount_end - mount_begin;
                        let path_len = mount_len + path_suffix.len(); // Includes NUL
                        if path_len > path.len() {
                            // Mount point too long for `path`: skip this entry.
                            continue;
                        }

                        // path = mountpoint + suffix
                        path[0..mount_len].copy_from_slice(&line[mount_begin..mount_end]);
                        path[mount_len..path_len].copy_from_slice(path_suffix); // Includes NUL

                        if !keep_looking {
                            break;
                        }
                    }

                    unsafe { linux::fclose(mounts_file) };

                    if path[0] == 0 {
                        // No tracefs or debugfs entry was stored.
                        new_file_or_error = -linux::ENOTSUP;
                    } else {
                        // path is now something like "/sys/kernel/tracing/user_events_data\0" or
                        // "/sys/kernel/debug/tracing/user_events_data\0".
                        clear_errno();
                        new_file_or_error = if let new_file @ 0.. = open_wronly(&path) {
                            new_file
                        } else {
                            -get_failure_errno()
                        };
                    }
                }
            }
        }

        // Publish the result. The first attempt expects the initial -EAGAIN
        // state; later attempts expect whatever value the previous attempt saw.
        let mut old_file_or_error = Self::EAGAIN_ERROR;
        loop {
            match self.file_or_error.compare_exchange(
                old_file_or_error,
                new_file_or_error,
                Ordering::Relaxed,
                Ordering::Relaxed,
            ) {
                Ok(_) => {
                    // We updated FILE_OR_ERROR to new.
                    return new_file_or_error;
                }
                Err(current_file_or_error) => {
                    // Somebody else updated FILE_OR_ERROR to current.
                    if current_file_or_error >= 0 || new_file_or_error < 0 {
                        // prefer current.
                        // If we opened a descriptor that will not be used, close it
                        // here so it does not leak.
                        #[cfg(all(target_os = "linux", feature = "user_events"))]
                        if new_file_or_error >= 0 {
                            unsafe { linux::close(new_file_or_error) };
                        }
                        return current_file_or_error;
                    }

                    // current is an error, new is a file, try again.
                    old_file_or_error = current_file_or_error;
                }
            }
        }
    }
230
231 // Initial state is -EAGAIN.
232 pub const fn new() -> Self {
233 return Self {
234 file_or_error: AtomicI32::new(Self::EAGAIN_ERROR),
235 };
236 }
237
238 // If file is open, closes it. Sets state to -EAGAIN.
239 pub fn close(&self) {
240 let file_or_error = self
241 .file_or_error
242 .swap(Self::EAGAIN_ERROR, Ordering::Relaxed);
243 if file_or_error >= 0 {
244 #[cfg(all(target_os = "linux", feature = "user_events"))]
245 unsafe {
246 linux::close(file_or_error)
247 };
248 }
249 }
250
251 // Returns existing state. This will be non-negative user_events_data file
252 // descriptor or -errno if file is not currently open.
253 #[cfg(all(target_os = "linux", feature = "user_events"))]
254 pub fn peek(&self) -> i32 {
255 return self.file_or_error.load(Ordering::Relaxed);
256 }
257
258 // If we have not already tried to open the `user_events_data` file, try
259 // to open it, atomically update state, and return the new state. Otherwise,
260 // return the existing state. Returns non-negative user_events_data file
261 // descriptor on success or -errno for error.
262 #[inline]
263 pub fn get(&self) -> i32 {
264 let file_or_error = self.file_or_error.load(Ordering::Relaxed);
265 return if file_or_error == Self::EAGAIN_ERROR {
266 self.update()
267 } else {
268 file_or_error
269 };
270 }
271}
272
273impl Drop for UserEventsDataFile {
274 fn drop(&mut self) {
275 self.close();
276 }
277}
278
/// Low-level API: Represents a tracepoint registration.
///
/// Registration passes the address of `enable_status` to the kernel, so the
/// value must stay at a fixed address while registered; `register` therefore
/// takes `Pin<&Self>`.
pub struct TracepointState {
    /// The kernel will update this variable with tracepoint enable/disable state.
    /// It will be 0 if tracepoint is disabled, nonzero if tracepoint is enabled.
    enable_status: AtomicU32,

    /// This will be a kernel-assigned value if registered,
    /// `UNREGISTERED_WRITE_INDEX` or `BUSY_WRITE_INDEX` if not registered.
    write_index: AtomicU32,

    /// Marker that makes this type `!Unpin`.
    _pinned: marker::PhantomPinned,
}
291
292impl TracepointState {
    /// `write_index` value meaning "not registered".
    const UNREGISTERED_WRITE_INDEX: u32 = u32::MAX;
    /// `write_index` value meaning "a register/unregister call is in progress".
    const BUSY_WRITE_INDEX: u32 = u32::MAX - 1;
    /// Largest `write_index` value that is treated as a real index; anything
    /// above it is one of the two sentinels.
    const HIGHEST_VALID_WRITE_INDEX: u32 = u32::MAX - 2;

    /// Direction flag passed as `dir` to `ioc`.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const IOC_WRITE: ffi::c_ulong = 1;

    /// Direction flag passed as `dir` to `ioc`.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const IOC_READ: ffi::c_ulong = 2;

    /// Type value (`'*'`) passed as `typ` to `ioc` for both requests below.
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const DIAG_IOC_MAGIC: ffi::c_ulong = '*' as ffi::c_ulong;

    /// ioctl request used by `register_with_flags` (read+write, number 0).
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const DIAG_IOCSREG: ffi::c_ulong =
        Self::ioc(Self::IOC_WRITE | Self::IOC_READ, Self::DIAG_IOC_MAGIC, 0);

    /// ioctl request used by `unregister` (write, number 2).
    #[cfg(all(target_os = "linux", feature = "user_events"))]
    const DIAG_IOCSUNREG: ffi::c_ulong = Self::ioc(Self::IOC_WRITE, Self::DIAG_IOC_MAGIC, 2);
312
313 #[cfg(all(target_os = "linux", feature = "user_events"))]
314 const fn ioc(dir: ffi::c_ulong, typ: ffi::c_ulong, nr: ffi::c_ulong) -> ffi::c_ulong {
315 const IOC_NRBITS: u8 = 8;
316 const IOC_TYPEBITS: u8 = 8;
317 const IOC_SIZEBITS: u8 = 14;
318 const IOC_NRSHIFT: u8 = 0;
319 const IOC_TYPESHIFT: u8 = IOC_NRSHIFT + IOC_NRBITS;
320 const IOC_SIZESHIFT: u8 = IOC_TYPESHIFT + IOC_TYPEBITS;
321 const IOC_DIRSHIFT: u8 = IOC_SIZESHIFT + IOC_SIZEBITS;
322
323 return (dir << IOC_DIRSHIFT)
324 | (typ << IOC_TYPESHIFT)
325 | (nr << IOC_NRSHIFT)
326 | ((size_of::<usize>() as ffi::c_ulong) << IOC_SIZESHIFT);
327 }
328
329 /// Creates a new unregistered tracepoint.
330 ///
331 /// initial_enable_status is normally 0, since an unregistered tracepoint will
332 /// normally be considered disabled.
333 pub const fn new(initial_enable_status: u32) -> Self {
334 return Self {
335 enable_status: AtomicU32::new(initial_enable_status),
336 write_index: AtomicU32::new(Self::UNREGISTERED_WRITE_INDEX),
337 _pinned: marker::PhantomPinned,
338 };
339 }
340
341 /// Returns true if this tracepoint is enabled, i.e. `enable_status != 0`.
342 #[inline(always)]
343 pub fn enabled(&self) -> bool {
344 return 0 != self.enable_status.load(Ordering::Relaxed);
345 }
346
347 /// Unregisters this tracepoint.
348 ///
349 /// Returns 0 for success, error code (e.g. EBUSY, EALREADY) for error.
350 /// Error code is usually ignored in retail code, but may be helpful during
351 /// development to understand behavior or track down issues.
352 pub fn unregister(&self) -> i32 {
353 let error;
354
355 let old_write_index = self
356 .write_index
357 .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
358 match old_write_index {
359 Self::BUSY_WRITE_INDEX => {
360 error = 16; // EBUSY: Another thread is registering/unregistering. Do nothing.
361 return error; // Return immediately, need to leave write_index = BUSY.
362 }
363 Self::UNREGISTERED_WRITE_INDEX => {
364 error = 116; // EALREADY: Already unregistered. No action needed.
365 }
366 _ => {
367 #[cfg(not(all(target_os = "linux", feature = "user_events")))]
368 {
369 error = 0;
370 }
371
372 #[cfg(all(target_os = "linux", feature = "user_events"))]
373 {
374 #[repr(C, packed)]
375 #[allow(non_camel_case_types)]
376 struct user_unreg {
377 size: u32,
378 disable_bit: u8,
379 reserved1: u8,
380 reserved2: u16,
381 disable_addr: u64,
382 }
383
384 let unreg = user_unreg {
385 size: size_of::<user_unreg>() as u32,
386 disable_bit: 0,
387 reserved1: 0,
388 reserved2: 0,
389 disable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
390 };
391
392 clear_errno();
393 let ioctl_result = unsafe {
394 linux::ioctl(USER_EVENTS_DATA_FILE.peek(), Self::DIAG_IOCSUNREG, &unreg)
395 };
396 if 0 > ioctl_result {
397 error = get_failure_errno();
398 } else {
399 error = 0;
400 }
401 }
402 }
403 }
404
405 let old_write_index = self
406 .write_index
407 .swap(Self::UNREGISTERED_WRITE_INDEX, Ordering::Relaxed);
408 debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
409
410 return error;
411 }
412
413 /// Registers this tracepoint.
414 ///
415 /// Requires: this `TracepointState` is not currently registered.
416 ///
417 /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
418 /// is usually ignored in retail scenarios but may be helpful during development to
419 /// understand behavior or track down issues.
420 ///
421 /// `_name_args` is the tracepoint definition in the format
422 /// `Name[ FieldDef1[;FieldDef2...]]`. For example:
423 ///
424 /// - `MyTracepoint1`
425 /// - `MyTracepoint2 u32 Field1`
426 /// - `MyTracepoint3 u32 Field1;char Field2[20]`
427 ///
428 /// # Safety
429 ///
430 /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
431 /// will unregister itself when dropped, so this is only an issue if the tracepoint
432 /// is not dropped before it is deallocated. This might happen for a static variable
433 /// in a shared library that gets unloaded.
434 pub unsafe fn register(self: Pin<&Self>, _name_args: &ffi::CStr) -> i32 {
435 return self.register_with_flags(_name_args, 0);
436 }
437
438 /// Advanced: Registers this tracepoint using the specified `user_reg` flags.
439 ///
440 /// Requires: this `TracepointState` is not currently registered.
441 ///
442 /// Returns 0 for success, error code (e.g. EACCES, ENOENT) for error. The error code
443 /// is usually ignored in retail scenarios but may be helpful during development to
444 /// understand behavior or track down issues.
445 ///
446 /// `_name_args` is the tracepoint definition in the format
447 /// `Name[ FieldDef1[;FieldDef2...]]`. For example:
448 ///
449 /// - `MyTracepoint1`
450 /// - `MyTracepoint2 u32 Field1`
451 /// - `MyTracepoint3 u32 Field1;char Field2[20]`
452 ///
453 /// `_flags` is normally `0`, but may also be set to a `user_reg` flag such as
454 /// `USER_EVENT_REG_PERSIST`.
455 ///
456 /// # Safety
457 ///
458 /// The tracepoint must be unregistered before it is deallocated. `TracepointState`
459 /// will unregister itself when dropped, so this is only an issue if the tracepoint
460 /// is not dropped before it is deallocated. This might happen for a static variable
461 /// in a shared library that gets unloaded.
462 pub unsafe fn register_with_flags(
463 self: Pin<&Self>,
464 _name_args: &ffi::CStr,
465 _flags: u16,
466 ) -> i32 {
467 let error;
468 let new_write_index;
469
470 let old_write_index = self
471 .write_index
472 .swap(Self::BUSY_WRITE_INDEX, Ordering::Relaxed);
473 assert!(
474 old_write_index == Self::UNREGISTERED_WRITE_INDEX,
475 "register of active tracepoint (already-registered or being-unregistered)"
476 );
477
478 let user_events_data = USER_EVENTS_DATA_FILE.get();
479 if user_events_data < 0 {
480 error = -user_events_data;
481 new_write_index = Self::UNREGISTERED_WRITE_INDEX;
482 } else {
483 #[cfg(not(all(target_os = "linux", feature = "user_events")))]
484 {
485 error = 0;
486 new_write_index = 0;
487 }
488
489 #[cfg(all(target_os = "linux", feature = "user_events"))]
490 {
491 #[repr(C, packed)]
492 #[allow(non_camel_case_types)]
493 struct user_reg {
494 size: u32,
495 enable_bit: u8,
496 enable_size: u8,
497 flags: u16,
498 enable_addr: u64,
499 name_args: u64,
500 write_index: u32,
501 }
502
503 let mut reg = user_reg {
504 size: size_of::<user_reg>() as u32,
505 enable_bit: 0,
506 enable_size: 4,
507 flags: _flags,
508 enable_addr: &self.enable_status as *const AtomicU32 as usize as u64,
509 name_args: _name_args.as_ptr() as usize as u64,
510 write_index: 0,
511 };
512
513 clear_errno();
514 let ioctl_result =
515 unsafe { linux::ioctl(user_events_data, Self::DIAG_IOCSREG, &mut reg) };
516 if 0 > ioctl_result {
517 error = get_failure_errno();
518 new_write_index = Self::UNREGISTERED_WRITE_INDEX;
519 } else {
520 error = 0;
521 new_write_index = reg.write_index;
522 debug_assert!(new_write_index <= Self::HIGHEST_VALID_WRITE_INDEX);
523 }
524 }
525 }
526
527 let old_write_index = self.write_index.swap(new_write_index, Ordering::Relaxed);
528 debug_assert!(old_write_index == Self::BUSY_WRITE_INDEX);
529
530 return error;
531 }
532
533 /// Generates an event.
534 ///
535 /// Requires: `data[0].is_empty()` since it will be used for the event headers.
536 ///
537 /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
538 /// is usually ignored in retail scenarios but may be helpful during development to
539 /// understand behavior or track down issues.
540 ///
541 /// If disabled or unregistered, this method does nothing and returnes EBADF.
542 /// Otherwise, sets `data[0] = write_index` then sends `data[..]` to the
543 /// `user_events_data` file handle.
544 ///
545 /// The event's payload is the concatenation of the remaining data blocks, if any
546 /// (i.e. `data[1..]`).
547 ///
548 /// The payload's layout should match the args specified in the call to `register`.
549 pub fn write(&self, data: &mut [EventDataDescriptor]) -> i32 {
550 debug_assert!(data[0].is_empty());
551
552 let enable_status = self.enable_status.load(Ordering::Relaxed);
553 let write_index = self.write_index.load(Ordering::Relaxed);
554 if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
555 return 9; // linux::EBADF
556 }
557
558 let writev_result = self.writev(data, &write_index.to_ne_bytes());
559 return writev_result;
560 }
561
562 /// Generates an event with headers.
563 ///
564 /// Requires: `data[0].is_empty()` since it will be used for the event headers;
565 /// `headers.len() >= 4` since it will be used for `write_index`.
566 ///
567 /// Returns 0 for success, error code (e.g. EBADF) for error. The error code
568 /// is usually ignored in retail scenarios but may be helpful during development to
569 /// understand behavior or track down issues.
570 ///
571 /// If disabled or unregistered, this method does nothing and returnes EBADF.
572 /// Otherwise, sets `data[0] = headers` and `headers[0..4] = write_index`, then sends
573 /// `data[..]` to the `user_events_data` file.
574 ///
575 /// The event's payload is the concatenation of the remaining data blocks, if any
576 /// (i.e. `data[1..]`).
577 ///
578 /// The payload's layout should match the args specified in the call to `register`.
579 pub fn write_with_headers(&self, data: &mut [EventDataDescriptor], headers: &mut [u8]) -> i32 {
580 debug_assert!(data[0].is_empty());
581 debug_assert!(headers.len() >= 4);
582
583 let enable_status = self.enable_status.load(Ordering::Relaxed);
584 let write_index = self.write_index.load(Ordering::Relaxed);
585 if enable_status == 0 || write_index > Self::HIGHEST_VALID_WRITE_INDEX {
586 return 9; // linux::EBADF
587 }
588
589 *<&mut [u8; 4]>::try_from(&mut headers[0..4]).unwrap() = write_index.to_ne_bytes();
590
591 let writev_result = self.writev(data, headers);
592 return writev_result;
593 }
594
595 // Returns 0 for success, errno for error.
596 fn writev(&self, _data: &mut [EventDataDescriptor], _headers: &[u8]) -> i32 {
597 #[cfg(all(target_os = "linux", feature = "user_events"))]
598 unsafe {
599 // Unsafe: Putting headers into a container a with longer lifetime.
600 _data[0] =
601 EventDataDescriptor::from_raw_ptr(_headers.as_ptr() as usize, _headers.len());
602
603 let writev_result = linux::writev(
604 USER_EVENTS_DATA_FILE.peek(),
605 _data.as_ptr() as *const linux::iovec,
606 _data.len() as i32,
607 );
608
609 // Clear the container before headers lifetime ends.
610 _data[0] = EventDataDescriptor::zero();
611
612 if 0 > writev_result {
613 return get_failure_errno();
614 }
615 }
616
617 return 0;
618 }
619}
620
621impl Drop for TracepointState {
622 fn drop(&mut self) {
623 self.unregister();
624 }
625}
626
/// Possible configurations under which this crate can be compiled: `LinuxUserEvents` or
/// `Other`.
///
/// This is the type of the `NATIVE_IMPLEMENTATION` constant.
pub enum NativeImplementation {
    /// Crate compiled for other configuration (no logging is performed).
    Other,

    /// Crate compiled for Linux user_events configuration (logging is performed via
    /// `user_events_data` file).
    LinuxUserEvents,
}
637
638/// The configuration under which this crate was compiled: `LinuxUserEvents` or `Other`.
639pub const NATIVE_IMPLEMENTATION: NativeImplementation =
640 if cfg!(all(target_os = "linux", feature = "user_events")) {
641 NativeImplementation::LinuxUserEvents
642 } else {
643 NativeImplementation::Other
644 };