perf_event_open/count/mod.rs
use std::borrow::Borrow;
use std::cell::UnsafeCell;
use std::ffi::CStr;
use std::fs::File;
use std::io::{self, Error, ErrorKind, Result};
use std::mem::transmute;
use std::os::fd::AsRawFd;
use std::sync::Arc;

use super::sample::Sampler;
use crate::config::attr::from;
use crate::config::{Opts, Target};
use crate::event::Event;
use crate::ffi::syscall::{ioctl_arg, ioctl_argp, perf_event_open, prctl, read};
use crate::ffi::{bindings as b, Attr};

pub mod group;
mod stat;

pub use stat::*;

/// Event counter.
///
/// Linux has many performance events to help developers identify performance
/// issues with their programs. The [`perf_event_open`](https://man7.org/linux/man-pages/man2/perf_event_open.2.html)
/// system call exposes the performance event subsystem for us to monitor these events.
///
/// This type is the core of using `perf_event_open`: it provides the event
/// counting functionality, similar to the `perf stat` command.
///
/// # Permission
///
/// Access to performance monitoring and observability operations requires the
/// `CAP_PERFMON` or `CAP_SYS_ADMIN` Linux capability. Users without these
/// capabilities can instead adjust `/proc/sys/kernel/perf_event_paranoid`.
///
/// Possible values:
/// - -1: Allow use of (almost) all events by all users. Ignore mlock limit
///   after `perf_event_mlock_kb` without `CAP_IPC_LOCK`.
/// - \>= 0: Disallow raw and ftrace function tracepoint access.
/// - \>= 1: Disallow CPU event access.
/// - \>= 2: Disallow kernel profiling.
///
/// To make the adjusted `perf_event_paranoid` setting permanent, preserve it
/// in `/etc/sysctl.conf` (e.g., `kernel.perf_event_paranoid = <setting>`).
///
/// # Examples
///
/// ```rust
/// use perf_event_open::config::{Cpu, Opts, Proc, SampleOn, Size};
/// use perf_event_open::count::Counter;
/// use perf_event_open::event::hw::Hardware;
///
/// // Count retired instructions of the current process, on all CPUs.
/// let event = Hardware::Instr;
/// let target = (Proc::CURRENT, Cpu::ALL);
///
/// let mut opts = Opts::default();
/// opts.sample_on = SampleOn::Freq(1000); // 1000 samples per second.
/// opts.sample_format.user_stack = Some(Size(8)); // Dump 8 bytes of user stack in samples.
///
/// let counter = Counter::new(event, target, opts).unwrap();
///
/// counter.enable().unwrap(); // Start the counter.
/// fn fib(n: usize) -> usize {
///     match n {
///         0 => 0,
///         1 => 1,
///         n => fib(n - 1) + fib(n - 2),
///     }
/// }
/// std::hint::black_box(fib(30));
/// counter.disable().unwrap(); // Stop the counter.
///
/// let instrs = counter.stat().unwrap().count;
/// println!("{} instructions retired", instrs);
/// ```
pub struct Counter {
    pub(crate) target: Target,
    pub(crate) attr: UnsafeCell<Attr>,
    pub(crate) perf: Arc<File>,
    pub(crate) read_buf: UnsafeCell<Vec<u8>>,
}

impl Counter {
    /// Creates a new event counter.
    pub fn new(
        event: impl TryInto<Event, Error = io::Error>,
        target: impl Into<Target>,
        opts: impl Borrow<Opts>,
    ) -> Result<Self> {
        let target = target.into();
        let attr = from(event.try_into()?.0, opts.borrow())?;
        let flags = target.flags | b::PERF_FLAG_FD_CLOEXEC as u64;
        let perf = perf_event_open(&attr, target.pid, target.cpu, -1, flags)?;
        // For now there is only one event in the group. If this counter
        // later becomes a group leader, `CounterGroup::add` will allocate
        // a new buffer if `PERF_FORMAT_GROUP` is enabled.
        let read_buf = vec![0; Stat::read_buf_size(1, attr.read_format)];

        Ok(Self {
            target,
            attr: UnsafeCell::new(attr),
            perf: Arc::new(perf),
            read_buf: UnsafeCell::new(read_buf),
        })
    }

    /// Enables all counters attached to the calling process.
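    ///
    /// # Examples
    ///
    /// A minimal sketch: pause and resume every counter attached to this
    /// process around a code region.
    ///
    /// ```rust,no_run
    /// use perf_event_open::count::Counter;
    ///
    /// Counter::disable_all().unwrap();
    /// // ... code that should not be counted ...
    /// Counter::enable_all().unwrap();
    /// ```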
    pub fn enable_all() -> Result<()> {
        prctl(libc::PR_TASK_PERF_EVENTS_ENABLE)
    }

    /// Disables all counters attached to the calling process.
    pub fn disable_all() -> Result<()> {
        prctl(libc::PR_TASK_PERF_EVENTS_DISABLE)
    }

    /// Creates a sampler for this counter.
    ///
    /// The sampler needs a ring buffer to store metadata and records;
    /// 1 + 2^`exp` pages will be allocated for this.
    ///
    /// A counter cannot have multiple samplers simultaneously.
    /// Attempting to create a new sampler while the previous one
    /// is still active will result in [`ErrorKind::AlreadyExists`].
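    ///
    /// # Examples
    ///
    /// A minimal sketch (assumes permission to open performance events;
    /// the setup mirrors the [`Counter`] example):
    ///
    /// ```rust,no_run
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    ///
    /// // 1 + 2^5 pages are allocated for the ring buffer.
    /// let _sampler = counter.sampler(5).unwrap();
    ///
    /// // A second sampler is rejected while the first one is still alive.
    /// assert!(counter.sampler(5).is_err());
    /// ```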
    pub fn sampler(&self, exp: u8) -> Result<Sampler> {
        if Arc::strong_count(&self.perf) == 1 {
            // We only change the attr fields related to event config,
            // which are not used in `ChunkParser::from_attr`.
            let attr = unsafe { &*self.attr.get() };
            Sampler::new(Arc::clone(&self.perf), attr, exp)
        } else {
            // The kernel allows creating multiple samplers for a counter;
            // these samplers share the same ring buffer in kernel space
            // and require the same mmap length.
            //
            // Multiple samplers would make the `Send` impl unsound: samplers
            // from different threads would race on the drop of COW chunks,
            // which may set the ring buffer head backwards.
            //
            // We prohibit users from creating multiple samplers per counter to
            // avoid the data race. Creating multiple samplers on the same counter
            // is usually useless, while the `Send` impl is much more useful.
            let error = "There is already a sampler attached to this counter.";
            Err(Error::new(ErrorKind::AlreadyExists, error))
        }
    }

    /// Returns the file handle opened by the [`perf_event_open`](https://man7.org/linux/man-pages/man2/perf_event_open.2.html)
    /// system call for the current event.
    ///
    /// This might be useful if we want to interact with the handle directly.
    pub fn file(&self) -> &File {
        &self.perf
    }

    /// Returns the event ID.
    ///
    /// The event ID is a globally incremented ID used to distinguish the
    /// results of different counters.
    ///
    /// This is the same as [`Stat::id`], [`SiblingStat::id`] and [`RecordId::id`][crate::sample::record::RecordId::id].
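    ///
    /// # Examples
    ///
    /// A minimal sketch (setup mirrors the [`Counter`] example):
    ///
    /// ```rust,no_run
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    /// println!("event ID: {}", counter.id().unwrap());
    /// ```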
    pub fn id(&self) -> Result<u64> {
        let mut id = 0;
        ioctl_argp(&self.perf, b::PERF_IOC_OP_ID as _, &mut id)?;
        Ok(id)
    }

    /// Enables the counter.
    ///
    /// The counter will start to accumulate event counts.
    pub fn enable(&self) -> Result<()> {
        ioctl_arg(&self.perf, b::PERF_IOC_OP_ENABLE as _, 0)?;
        Ok(())
    }

    /// Disables the counter.
    ///
    /// The counter will stop accumulating event counts.
    pub fn disable(&self) -> Result<()> {
        ioctl_arg(&self.perf, b::PERF_IOC_OP_DISABLE as _, 0)?;
        Ok(())
    }

    /// Clears the event count.
    ///
    /// This only clears the event count in the statistics;
    /// other fields (such as `time_enabled`) are not affected.
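    ///
    /// # Examples
    ///
    /// A minimal sketch: drop the counts of a warm-up phase before measuring
    /// the workload of interest (setup mirrors the [`Counter`] example).
    ///
    /// ```rust,no_run
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    ///
    /// counter.enable().unwrap();
    /// // ... warm-up we do not want to count ...
    /// counter.clear_count().unwrap();
    /// // ... workload we want to measure ...
    /// counter.disable().unwrap();
    ///
    /// println!("{} events", counter.stat().unwrap().count);
    /// ```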
    pub fn clear_count(&self) -> Result<()> {
        ioctl_arg(&self.perf, b::PERF_IOC_OP_RESET as _, 0)?;
        Ok(())
    }

    /// Returns counter statistics.
    pub fn stat(&self) -> Result<Stat> {
        // There can be at most one reference to `read_buf` at a time,
        // since `Counter` is not `Sync`.
        let buf = unsafe { &mut *self.read_buf.get() };

        read(&self.perf, buf)?;
        let buf = buf.as_mut_slice();
        let buf = unsafe { transmute::<&mut [_], &mut [u8]>(buf) };

        let ptr = buf.as_ptr();
        // We only change the attr fields related to event config;
        // `read_format` is never touched.
        let read_format = unsafe { &*self.attr.get() }.read_format;
        let stat = unsafe { Stat::from_ptr(ptr, read_format) };

        Ok(stat)
    }

    /// Attaches a BPF program to an existing kprobe tracepoint event.
    ///
    /// The argument is a BPF program file that was created by a previous
    /// [`bpf`](https://man7.org/linux/man-pages/man2/bpf.2.html) system call.
    pub fn attach_bpf(&self, file: &File) -> Result<()> {
        ioctl_arg(
            &self.perf,
            b::PERF_IOC_OP_SET_BPF as _,
            file.as_raw_fd() as _,
        )?;
        Ok(())
    }

    /// Queries which BPF programs are attached to an existing kprobe
    /// tracepoint event.
    ///
    /// Returns the IDs of all BPF programs in all events attached to the tracepoint.
    ///
    /// If the buffer is not large enough to contain all IDs,
    /// it also indicates how many IDs were lost.
    ///
    /// Since `linux-4.16`: <https://github.com/torvalds/linux/commit/f371b304f12e31fe30207c41ca7754564e0ea4dc>
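    ///
    /// # Examples
    ///
    /// A sketch of interpreting the result. The hardware event below is only
    /// a placeholder for brevity; in practice the counter should wrap a kprobe
    /// tracepoint event with BPF programs attached.
    ///
    /// ```rust,no_run
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    ///
    /// let (ids, lost) = counter.query_bpf(16).unwrap();
    /// println!("attached BPF program IDs: {:?}", ids);
    /// if let Some(lost) = lost {
    ///     println!("buffer too small, {} more IDs were not returned", lost);
    /// }
    /// ```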
    #[cfg(feature = "linux-4.16")]
    pub fn query_bpf(&self, buf_len: u32) -> Result<(Vec<u32>, Option<u32>)> {
        // struct perf_event_query_bpf {
        //     u32 ids_len;
        //     u32 prog_cnt;
        //     u32 ids[0];
        // }

        use std::mem::MaybeUninit;
        let mut buf = vec![MaybeUninit::uninit(); (2 + buf_len) as _];
        buf[0] = MaybeUninit::new(buf_len); // set `ids_len`

        match ioctl_argp(
            &self.perf,
            b::PERF_IOC_OP_QUERY_BPF as _,
            buf.as_mut_slice(),
        ) {
            Ok(_) => {
                let prog_cnt = unsafe { buf[1].assume_init() };

                let ids = buf[2..2 + (prog_cnt as usize)].to_vec();
                let ids = unsafe { transmute::<Vec<_>, Vec<u32>>(ids) };

                Ok((ids, None))
            }
            Err(e) => {
                let option = e.raw_os_error();

                // `option` is always `Some` since `Error` is constructed
                // by `ioctl_argp` via `Error::last_os_error`.
                let errno = unsafe { option.unwrap_unchecked() };

                if errno == libc::ENOSPC {
                    let prog_cnt = unsafe { buf[1].assume_init() };

                    let ids = buf[2..].to_vec();
                    let ids = unsafe { transmute::<Vec<_>, Vec<u32>>(ids) };

                    return Ok((ids, Some(prog_cnt - buf_len)));
                }

                Err(e)
            }
        }
    }

    #[cfg(not(feature = "linux-4.16"))]
    pub fn query_bpf(&self, len: u32) -> Result<(Vec<u32>, Option<u32>)> {
        let _ = len;
        crate::config::unsupported!()
    }

    /// Adds an ftrace filter to the current event.
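    ///
    /// # Examples
    ///
    /// A minimal sketch of passing a filter expression. The hardware event
    /// below is only a placeholder for brevity; ftrace filters are meant for
    /// tracepoint-style events, and the filter string here is illustrative.
    ///
    /// ```rust,no_run
    /// use std::ffi::CString;
    ///
    /// use perf_event_open::config::{Cpu, Opts, Proc};
    /// use perf_event_open::count::Counter;
    /// use perf_event_open::event::hw::Hardware;
    ///
    /// let counter = Counter::new(Hardware::Instr, (Proc::CURRENT, Cpu::ALL), Opts::default()).unwrap();
    ///
    /// let filter = CString::new("common_pid != 0").unwrap();
    /// counter.with_ftrace_filter(&filter).unwrap();
    /// ```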
    pub fn with_ftrace_filter(&self, filter: &CStr) -> Result<()> {
        let ptr = filter.as_ptr() as *mut i8;

        // The following ioctl op just copies the bytes to kernel space,
        // so we don't have to worry about the mutable reference.
        let argp = unsafe { &mut *ptr };

        ioctl_argp(&self.perf, b::PERF_IOC_OP_SET_FILTER as _, argp)?;
        Ok(())
    }

    /// Switches to another event.
    ///
    /// This allows modifying an existing event without the overhead of
    /// closing the counter and opening a new one.
    ///
    /// Currently this is supported only for breakpoint events.
    ///
    /// Since `linux-4.17`: <https://github.com/torvalds/linux/commit/32ff77e8cc9e66cc4fb38098f64fd54cc8f54573>
    #[cfg(feature = "linux-4.17")]
    pub fn switch_to<E>(&self, event: E) -> Result<()>
    where
        E: TryInto<Event, Error = io::Error>,
    {
        let Event(event_cfg): Event = event.try_into()?;

        // We can only access `self.attr` within the same thread,
        // so there is no potential data race.
        //
        // We only change fields related to event config, so this will
        // not break any assumptions or states, since these fields are
        // never used elsewhere after the counter is initialized.
        //
        // The following ioctl op just copies the modified attr to kernel space,
        // so we don't have to worry about the mutable reference.
        let attr = unsafe { &mut *self.attr.get() };
        attr.type_ = event_cfg.ty;
        attr.config = event_cfg.config;
        attr.__bindgen_anon_3.config1 = event_cfg.config1;
        attr.__bindgen_anon_4.config2 = event_cfg.config2;
        #[cfg(feature = "linux-6.3")]
        (attr.config3 = event_cfg.config3);
        attr.bp_type = event_cfg.bp_type;

        ioctl_argp(&self.perf, b::PERF_IOC_OP_MODIFY_ATTRS as _, attr)?;

        Ok(())
    }

    #[cfg(not(feature = "linux-4.17"))]
    pub fn switch_to<E>(&self, event: E) -> Result<()>
    where
        E: TryInto<Event, Error = io::Error>,
    {
        let _ = event;
        crate::config::unsupported!()
    }
}