perf_event_open/count/mod.rs

use std::borrow::Borrow;
use std::cell::UnsafeCell;
use std::ffi::CStr;
use std::fs::File;
use std::io::{self, Error, ErrorKind, Result};
use std::mem::transmute;
use std::os::fd::AsRawFd;
use std::sync::Arc;

use super::sample::Sampler;
use crate::config::attr::from;
use crate::config::{Opts, Target};
use crate::event::Event;
use crate::ffi::syscall::{ioctl_arg, ioctl_argp, perf_event_open, prctl, read};
use crate::ffi::{bindings as b, Attr};

pub mod group;
mod stat;

pub use stat::*;

/// Event counter.
///
/// Linux has many performance events to help developers identify performance
/// issues in their programs. The [`perf_event_open`](https://man7.org/linux/man-pages/man2/perf_event_open.2.html)
/// system call exposes the performance event subsystem so that we can monitor these events.
///
/// This type is the core of `perf_event_open` usage: it provides the event
/// counting functionality of `perf_event_open`, similar to the `perf stat` command.
///
/// # Permission
///
/// Access to performance monitoring and observability operations requires
/// the `CAP_PERFMON` or `CAP_SYS_ADMIN` Linux capability; alternatively,
/// consider adjusting `/proc/sys/kernel/perf_event_paranoid` for users
/// without these capabilities.
///
/// Possible values:
/// - -1: Allow use of (almost) all events by all users. Ignore the mlock limit
///   set by `perf_event_mlock_kb` for users without `CAP_IPC_LOCK`.
/// - \>= 0: Disallow raw and ftrace function tracepoint access.
/// - \>= 1: Disallow CPU event access.
/// - \>= 2: Disallow kernel profiling.
///
/// To make the adjusted `perf_event_paranoid` setting permanent, preserve it
/// in `/etc/sysctl.conf` (e.g., `kernel.perf_event_paranoid = <setting>`).
///
/// # Examples
///
/// ```rust
/// use perf_event_open::config::{Cpu, Opts, Proc, SampleOn, Size};
/// use perf_event_open::count::Counter;
/// use perf_event_open::event::hw::Hardware;
///
/// // Count retired instructions on the current process, all CPUs.
/// let event = Hardware::Instr;
/// let target = (Proc::CURRENT, Cpu::ALL);
///
/// let mut opts = Opts::default();
/// opts.sample_on = SampleOn::Freq(1000); // 1000 samples per second.
/// opts.sample_format.user_stack = Some(Size(8)); // Dump 8 bytes of user stack in each sample.
///
/// let counter = Counter::new(event, target, opts).unwrap();
///
/// counter.enable().unwrap(); // Start the counter.
/// fn fib(n: usize) -> usize {
///     match n {
///         0 => 0,
///         1 => 1,
///         n => fib(n - 1) + fib(n - 2),
///     }
/// }
/// std::hint::black_box(fib(30));
/// counter.disable().unwrap(); // Stop the counter.
///
/// let instrs = counter.stat().unwrap().count;
/// println!("{} instructions retired", instrs);
/// ```
pub struct Counter {
    pub(crate) target: Target,
    pub(crate) attr: UnsafeCell<Attr>,
    pub(crate) perf: Arc<File>,
    pub(crate) read_buf: UnsafeCell<Vec<u8>>,
}

impl Counter {
    /// Creates a new event counter.
    pub fn new(
        event: impl TryInto<Event, Error = io::Error>,
        target: impl Into<Target>,
        opts: impl Borrow<Opts>,
    ) -> Result<Self> {
        let target = target.into();
        let attr = from(event.try_into()?.0, opts.borrow())?;
        let flags = target.flags | b::PERF_FLAG_FD_CLOEXEC as u64;
        let perf = perf_event_open(&attr, target.pid, target.cpu, -1, flags)?;
        // For now there is only one event in the group; if this counter
        // later becomes the group leader, `CounterGroup::add` will
        // allocate a new buffer if `PERF_FORMAT_GROUP` is enabled.
        let read_buf = vec![0; Stat::read_buf_size(1, attr.read_format)];

        Ok(Self {
            target,
            attr: UnsafeCell::new(attr),
            perf: Arc::new(perf),
            read_buf: UnsafeCell::new(read_buf),
        })
    }

    /// Enables all counters attached to the calling process.
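    ///
    /// A minimal sketch of bracketing a workload with the process-wide
    /// switches (the event choice is illustrative):
    ///
    /// ```rust
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    ///
    /// Counter::enable_all().unwrap(); // Flip on every counter at once.
    /// std::hint::black_box(1 + 1); // Workload.
    /// Counter::disable_all().unwrap();
    ///
    /// println!("{} instructions retired", counter.stat().unwrap().count);
    /// ```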
    pub fn enable_all() -> Result<()> {
        prctl(libc::PR_TASK_PERF_EVENTS_ENABLE)
    }

    /// Disables all counters attached to the calling process.
    pub fn disable_all() -> Result<()> {
        prctl(libc::PR_TASK_PERF_EVENTS_DISABLE)
    }

    /// Creates a sampler for this counter.
    ///
    /// The sampler needs a ring buffer to store metadata and records;
    /// 1 + 2^`exp` pages will be allocated for this.
    ///
    /// A counter cannot have multiple samplers simultaneously.
    /// Attempting to create a new sampler while the previous one
    /// is still active will result in [`ErrorKind::AlreadyExists`].
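    ///
    /// A minimal sketch (the event and `exp` value are illustrative):
    ///
    /// ```rust
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    ///
    /// let sampler = counter.sampler(5).unwrap(); // 1 + 2^5 = 33 pages.
    /// assert!(counter.sampler(5).is_err()); // The first sampler is still alive.
    /// drop(sampler);
    /// ```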
    pub fn sampler(&self, exp: u8) -> Result<Sampler> {
        if Arc::strong_count(&self.perf) == 1 {
            // We only change the attr fields related to the event config,
            // which are not used in `ChunkParser::from_attr`.
            let attr = unsafe { &*self.attr.get() };
            Sampler::new(Arc::clone(&self.perf), attr, exp)
        } else {
            // The kernel allows creating multiple samplers for a counter; these
            // samplers share the same ring buffer in kernel space and require
            // the same mmap length.
            //
            // Multiple samplers would make the `Send` impl unsound: samplers
            // on different threads would race on the drop of COW chunks, which
            // may move the ring-buffer head backwards.
            //
            // We prohibit users from creating multiple samplers per counter to
            // avoid the data race. Creating multiple samplers on the same counter
            // is usually useless, while the `Send` impl is much more useful.
            let error = "There is already a sampler attached to this counter.";
            Err(Error::new(ErrorKind::AlreadyExists, error))
        }
    }

    /// Returns the file handle opened by the [`perf_event_open`](https://man7.org/linux/man-pages/man2/perf_event_open.2.html)
    /// system call for the current event.
    ///
    /// This might be useful if we want to interact with the handle directly.
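    ///
    /// For example, a sketch of fetching the raw file descriptor to poll on
    /// (the event choice is illustrative):
    ///
    /// ```rust
    /// use std::os::fd::AsRawFd;
    ///
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    /// let fd = counter.file().as_raw_fd(); // E.g., register with epoll.
    /// assert!(fd >= 0);
    /// ```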
    pub fn file(&self) -> &File {
        &self.perf
    }

    /// Returns the event ID.
    ///
    /// The event ID is a globally incremented ID used to distinguish the
    /// results of different counters.
    ///
    /// This is the same as [`Stat::id`], [`SiblingStat::id`] and [`RecordId::id`][crate::sample::record::RecordId::id].
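    ///
    /// A minimal sketch (the events are illustrative); since the ID is
    /// globally incremented, two counters never share one:
    ///
    /// ```rust
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let a = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    /// let b = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    /// assert_ne!(a.id().unwrap(), b.id().unwrap());
    /// ```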
    pub fn id(&self) -> Result<u64> {
        let mut id = 0;
        ioctl_argp(&self.perf, b::PERF_IOC_OP_ID as _, &mut id)?;
        Ok(id)
    }

    /// Enables the counter.
    ///
    /// The counter will start accumulating event counts.
    pub fn enable(&self) -> Result<()> {
        ioctl_arg(&self.perf, b::PERF_IOC_OP_ENABLE as _, 0)?;
        Ok(())
    }

    /// Disables the counter.
    ///
    /// The counter will stop accumulating event counts.
    pub fn disable(&self) -> Result<()> {
        ioctl_arg(&self.perf, b::PERF_IOC_OP_DISABLE as _, 0)?;
        Ok(())
    }

    /// Clears the event count.
    ///
    /// This only clears the event count in the statistics;
    /// other fields (such as `time_enabled`) are not affected.
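    ///
    /// A minimal sketch (the event choice is illustrative):
    ///
    /// ```rust
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    ///
    /// counter.enable().unwrap();
    /// std::hint::black_box(1 + 1); // Workload.
    /// counter.disable().unwrap();
    ///
    /// counter.clear_count().unwrap(); // Only the count is reset, not `time_enabled`.
    /// assert_eq!(counter.stat().unwrap().count, 0);
    /// ```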
    pub fn clear_count(&self) -> Result<()> {
        ioctl_arg(&self.perf, b::PERF_IOC_OP_RESET as _, 0)?;
        Ok(())
    }

    /// Returns counter statistics.
    pub fn stat(&self) -> Result<Stat> {
        // There can be at most one reference to `read_buf` at a time,
        // since `Counter` is not `Sync`.
        let buf = unsafe { &mut *self.read_buf.get() };

        read(&self.perf, buf)?;
        let buf = buf.as_mut_slice();
        let buf = unsafe { transmute::<&mut [_], &mut [u8]>(buf) };

        let ptr = buf.as_ptr();
        // We only change the attr fields related to the event config,
        // which have nothing to do with `read_format`.
        let read_format = unsafe { &*self.attr.get() }.read_format;
        let stat = unsafe { Stat::from_ptr(ptr, read_format) };

        Ok(stat)
    }

    /// Attaches a BPF program to an existing kprobe tracepoint event.
    ///
    /// The argument is a BPF program file that was created by a previous
    /// [`bpf`](https://man7.org/linux/man-pages/man2/bpf.2.html) system call.
    pub fn attach_bpf(&self, file: &File) -> Result<()> {
        ioctl_arg(
            &self.perf,
            b::PERF_IOC_OP_SET_BPF as _,
            file.as_raw_fd() as _,
        )?;
        Ok(())
    }

    /// Queries which BPF programs are attached to the
    /// existing kprobe tracepoint event.
    ///
    /// Returns the IDs of all BPF programs in all events attached to the tracepoint.
    ///
    /// If the buffer is not large enough to contain all IDs,
    /// the result also indicates how many IDs were lost.
    ///
    /// Since `linux-4.16`: <https://github.com/torvalds/linux/commit/f371b304f12e31fe30207c41ca7754564e0ea4dc>
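    ///
    /// A sketch of retrying after a short read (marked `ignore` since it needs
    /// a kprobe event with BPF programs attached; `counter` is assumed to be
    /// such a counter):
    ///
    /// ```rust,ignore
    /// let (mut ids, lost) = counter.query_bpf(16)?;
    /// if let Some(lost) = lost {
    ///     // Not enough room for all IDs; retry with a large enough buffer.
    ///     (ids, _) = counter.query_bpf(16 + lost)?;
    /// }
    /// println!("Attached BPF program IDs: {:?}", ids);
    /// ```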
    #[cfg(feature = "linux-4.16")]
    pub fn query_bpf(&self, buf_len: u32) -> Result<(Vec<u32>, Option<u32>)> {
        // struct perf_event_query_bpf {
        //     u32 ids_len;
        //     u32 prog_cnt;
        //     u32 ids[0];
        // }

        use std::mem::MaybeUninit;
        let mut buf = vec![MaybeUninit::uninit(); (2 + buf_len) as _];
        buf[0] = MaybeUninit::new(buf_len); // Set `ids_len`.

        match ioctl_argp(
            &self.perf,
            b::PERF_IOC_OP_QUERY_BPF as _,
            buf.as_mut_slice(),
        ) {
            Ok(_) => {
                let prog_cnt = unsafe { buf[1].assume_init() };

                let ids = buf[2..2 + (prog_cnt as usize)].to_vec();
                let ids = unsafe { transmute::<Vec<_>, Vec<u32>>(ids) };

                Ok((ids, None))
            }
            Err(e) => {
                let option = e.raw_os_error();

                // `option` is always `Some` since the `Error` is constructed
                // by `ioctl_argp` via `Error::last_os_error`.
                let errno = unsafe { option.unwrap_unchecked() };

                if errno == libc::ENOSPC {
                    let prog_cnt = unsafe { buf[1].assume_init() };

                    let ids = buf[2..].to_vec();
                    let ids = unsafe { transmute::<Vec<_>, Vec<u32>>(ids) };

                    return Ok((ids, Some(prog_cnt - buf_len)));
                }

                Err(e)
            }
        }
    }

    #[cfg(not(feature = "linux-4.16"))]
    pub fn query_bpf(&self, len: u32) -> Result<(Vec<u32>, Option<u32>)> {
        let _ = len;
        crate::config::unsupported!()
    }

    /// Adds an ftrace filter to the current event.
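    ///
    /// A sketch of the call shape (marked `ignore` since it needs a
    /// tracepoint-style event; `counter` is assumed to be such a counter,
    /// and the filter expression is illustrative):
    ///
    /// ```rust,ignore
    /// use std::ffi::CString;
    ///
    /// // Only record events where `bytes_req` exceeds 1024; the syntax follows
    /// // the kernel's trace event filter language.
    /// let filter = CString::new("bytes_req > 1024").unwrap();
    /// counter.with_ftrace_filter(&filter)?;
    /// ```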
    pub fn with_ftrace_filter(&self, filter: &CStr) -> Result<()> {
        let ptr = filter.as_ptr() as *mut i8;

        // The following ioctl op just copies the bytes to kernel space,
        // so we don't have to worry about the mutable reference.
        let argp = unsafe { &mut *ptr };

        ioctl_argp(&self.perf, b::PERF_IOC_OP_SET_FILTER as _, argp)?;
        Ok(())
    }

    /// Switches to another event.
    ///
    /// This allows modifying an existing event without the overhead of
    /// closing and reopening a new counter.
    ///
    /// Currently this is supported only for breakpoint events.
    ///
    /// Since `linux-4.17`: <https://github.com/torvalds/linux/commit/32ff77e8cc9e66cc4fb38098f64fd54cc8f54573>
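    ///
    /// A sketch of the call shape (marked `ignore`; `watch_a` and `watch_b`
    /// stand for two breakpoint event configs, whose construction depends on
    /// the breakpoint event API):
    ///
    /// ```rust,ignore
    /// let counter = Counter::new(watch_a, (Proc::CURRENT, Cpu::ALL), Opts::default())?;
    /// counter.enable()?;
    /// // Retarget the counter later without closing and reopening it.
    /// counter.switch_to(watch_b)?;
    /// ```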
    #[cfg(feature = "linux-4.17")]
    pub fn switch_to<E>(&self, event: E) -> Result<()>
    where
        E: TryInto<Event, Error = io::Error>,
    {
        let Event(event_cfg): Event = event.try_into()?;

        // We can only access `self.attr` within the same thread,
        // so there is no potential data race.
        //
        // We only change the fields related to the event config; this
        // does not break any assumptions or states since these fields
        // are never used elsewhere after the counter is initialized.
        //
        // The following ioctl op just copies the modified attr to kernel space,
        // so we don't have to worry about the mutable reference.
        let attr = unsafe { &mut *self.attr.get() };
        attr.type_ = event_cfg.ty;
        attr.config = event_cfg.config;
        attr.__bindgen_anon_3.config1 = event_cfg.config1;
        attr.__bindgen_anon_4.config2 = event_cfg.config2;
        #[cfg(feature = "linux-6.3")]
        (attr.config3 = event_cfg.config3);
        attr.bp_type = event_cfg.bp_type;

        ioctl_argp(&self.perf, b::PERF_IOC_OP_MODIFY_ATTRS as _, attr)?;

        Ok(())
    }

    #[cfg(not(feature = "linux-4.17"))]
    pub fn switch_to<E>(&self, event: E) -> Result<()>
    where
        E: TryInto<Event, Error = io::Error>,
    {
        let _ = event;
        crate::config::unsupported!()
    }
}