use std::{
path::Path,
sync::{Arc, Mutex, OnceLock},
time::{Duration, Instant},
};
use tracing::{Subscriber, span};
use tracing_subscriber::{Layer, layer::Context, prelude::*, registry::LookupSpan};
const CPU_TRACK_UUID: u64 = 1;
const GPU_TRACK_UUID: u64 = 2;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
enum EventKind {
SliceBegin = 1,
SliceEnd = 2,
Instant = 3,
}
struct TraceEvent {
name: String,
timestamp_ns: u64,
track_uuid: u64,
kind: EventKind,
}
struct ProfilerInner {
epoch: Instant,
events: Vec<TraceEvent>,
}
impl ProfilerInner {
fn now_ns(&self) -> u64 {
self.epoch.elapsed().as_nanos() as u64
}
}
static PROFILER: OnceLock<Arc<Mutex<ProfilerInner>>> = OnceLock::new();
fn get_or_init() -> &'static Arc<Mutex<ProfilerInner>> {
PROFILER.get_or_init(|| {
Arc::new(Mutex::new(ProfilerInner {
epoch: Instant::now(),
events: Vec::with_capacity(8192),
}))
})
}
pub fn init() {
let inner = get_or_init().clone();
let layer = ProfileLayer { inner };
let subscriber = tracing_subscriber::registry().with(layer);
let _ = tracing::subscriber::set_global_default(subscriber);
}
pub fn record_gpu_passes(submit_offset_ns: u64, passes: &[(String, Duration)]) {
if let Some(inner) = PROFILER.get() {
let mut guard = inner.lock().unwrap();
let mut offset = submit_offset_ns;
for &(ref name, dur) in passes {
guard.events.push(TraceEvent {
name: name.clone(),
timestamp_ns: offset,
track_uuid: GPU_TRACK_UUID,
kind: EventKind::SliceBegin,
});
offset += dur.as_nanos() as u64;
guard.events.push(TraceEvent {
name: name.clone(),
timestamp_ns: offset,
track_uuid: GPU_TRACK_UUID,
kind: EventKind::SliceEnd,
});
}
}
}
pub fn record_instant(name: &str) {
if let Some(inner) = PROFILER.get() {
let mut guard = inner.lock().unwrap();
let ts = guard.now_ns();
guard.events.push(TraceEvent {
name: name.to_string(),
timestamp_ns: ts,
track_uuid: CPU_TRACK_UUID,
kind: EventKind::Instant,
});
}
}
pub fn now_ns() -> u64 {
PROFILER
.get()
.map(|inner| inner.lock().unwrap().now_ns())
.unwrap_or(0)
}
pub fn event_count() -> usize {
PROFILER
.get()
.map(|inner| inner.lock().unwrap().events.len())
.unwrap_or(0)
}
pub fn save(path: impl AsRef<Path>) -> std::io::Result<()> {
let inner = PROFILER
.get()
.ok_or_else(|| std::io::Error::other("profiler not initialized"))?;
let guard = inner.lock().unwrap();
write_pftrace(path.as_ref(), &guard.events)
}
pub struct ProfileLayer {
inner: Arc<Mutex<ProfilerInner>>,
}
impl<S> Layer<S> for ProfileLayer
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
fn on_enter(&self, id: &span::Id, ctx: Context<'_, S>) {
if let Some(span) = ctx.span(id) {
let mut guard = self.inner.lock().unwrap();
let ts = guard.now_ns();
guard.events.push(TraceEvent {
name: span.name().to_string(),
timestamp_ns: ts,
track_uuid: CPU_TRACK_UUID,
kind: EventKind::SliceBegin,
});
}
}
fn on_exit(&self, id: &span::Id, ctx: Context<'_, S>) {
if let Some(span) = ctx.span(id) {
let mut guard = self.inner.lock().unwrap();
let ts = guard.now_ns();
guard.events.push(TraceEvent {
name: span.name().to_string(),
timestamp_ns: ts,
track_uuid: CPU_TRACK_UUID,
kind: EventKind::SliceEnd,
});
}
}
fn on_event(&self, event: &tracing::Event<'_>, _ctx: Context<'_, S>) {
let mut guard = self.inner.lock().unwrap();
let ts = guard.now_ns();
guard.events.push(TraceEvent {
name: event.metadata().name().to_string(),
timestamp_ns: ts,
track_uuid: CPU_TRACK_UUID,
kind: EventKind::Instant,
});
}
}
fn write_pftrace(path: &Path, events: &[TraceEvent]) -> std::io::Result<()> {
use std::io::Write;
let mut trace = ProtoBuf::new();
let mut proc_desc = ProtoBuf::new();
proc_desc.uint32(1, std::process::id()); let mut track_desc = ProtoBuf::new();
track_desc.uint64(1, 0); track_desc.message(3, &proc_desc); track_desc.string(2, "meganeura"); let mut pkt = ProtoBuf::new();
pkt.message(60, &track_desc); pkt.uint32(10, 1); trace.message(1, &pkt);
let mut td = ProtoBuf::new();
td.uint64(1, CPU_TRACK_UUID);
td.uint64(5, 0); td.string(2, "CPU");
let mut pkt = ProtoBuf::new();
pkt.message(60, &td);
pkt.uint32(10, 1);
trace.message(1, &pkt);
let mut td = ProtoBuf::new();
td.uint64(1, GPU_TRACK_UUID);
td.uint64(5, 0); td.string(2, "GPU");
let mut pkt = ProtoBuf::new();
pkt.message(60, &td);
pkt.uint32(10, 1);
trace.message(1, &pkt);
let mut sorted: Vec<usize> = (0..events.len()).collect();
sorted.sort_by_key(|&i| events[i].timestamp_ns);
for &i in &sorted {
let ev = &events[i];
let mut te = ProtoBuf::new();
te.uint64(11, ev.track_uuid); te.int32(9, ev.kind as i32); te.string(23, &ev.name);
let mut pkt = ProtoBuf::new();
pkt.uint64(8, ev.timestamp_ns); pkt.message(11, &te); pkt.uint32(10, 1); trace.message(1, &pkt);
}
let mut f = std::io::BufWriter::new(std::fs::File::create(path)?);
f.write_all(&trace.buf)?;
Ok(())
}
struct ProtoBuf {
buf: Vec<u8>,
}
impl ProtoBuf {
fn new() -> Self {
Self {
buf: Vec::with_capacity(128),
}
}
fn write_varint(&mut self, mut val: u64) {
loop {
let byte = (val & 0x7F) as u8;
val >>= 7;
if val == 0 {
self.buf.push(byte);
return;
}
self.buf.push(byte | 0x80);
}
}
fn tag(&mut self, field: u32, wire_type: u32) {
self.write_varint(((field as u64) << 3) | wire_type as u64);
}
fn uint64(&mut self, field: u32, val: u64) {
self.tag(field, 0);
self.write_varint(val);
}
fn uint32(&mut self, field: u32, val: u32) {
self.tag(field, 0);
self.write_varint(val as u64);
}
fn int32(&mut self, field: u32, val: i32) {
self.tag(field, 0);
self.write_varint(val as u32 as u64);
}
fn string(&mut self, field: u32, val: &str) {
self.tag(field, 2);
self.write_varint(val.len() as u64);
self.buf.extend_from_slice(val.as_bytes());
}
fn message(&mut self, field: u32, msg: &ProtoBuf) {
self.tag(field, 2);
self.write_varint(msg.buf.len() as u64);
self.buf.extend_from_slice(&msg.buf);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_varint_encoding() {
let mut pb = ProtoBuf::new();
pb.write_varint(0);
assert_eq!(pb.buf, &[0]);
let mut pb = ProtoBuf::new();
pb.write_varint(1);
assert_eq!(pb.buf, &[1]);
let mut pb = ProtoBuf::new();
pb.write_varint(300);
assert_eq!(pb.buf, &[0xAC, 0x02]);
}
#[test]
fn test_save_produces_nonempty_file() {
let inner = get_or_init();
{
let mut guard = inner.lock().unwrap();
guard.events.push(TraceEvent {
name: "test_span".into(),
timestamp_ns: 1000,
track_uuid: CPU_TRACK_UUID,
kind: EventKind::SliceBegin,
});
guard.events.push(TraceEvent {
name: "test_span".into(),
timestamp_ns: 2000,
track_uuid: CPU_TRACK_UUID,
kind: EventKind::SliceEnd,
});
guard.events.push(TraceEvent {
name: "matmul".into(),
timestamp_ns: 1200,
track_uuid: GPU_TRACK_UUID,
kind: EventKind::SliceBegin,
});
guard.events.push(TraceEvent {
name: "matmul".into(),
timestamp_ns: 1800,
track_uuid: GPU_TRACK_UUID,
kind: EventKind::SliceEnd,
});
}
let dir = std::env::temp_dir().join("meganeura_profiler_test");
std::fs::create_dir_all(&dir).unwrap();
let path = dir.join("test.pftrace");
save(&path).unwrap();
let bytes = std::fs::read(&path).unwrap();
assert!(
bytes.len() > 50,
"trace file too small: {} bytes",
bytes.len()
);
assert_eq!(bytes[0] & 0x07, 2, "expected length-delimited wire type");
assert_eq!(bytes[0] >> 3, 1, "expected field number 1 (Trace.packet)");
let _ = std::fs::remove_file(&path);
let _ = std::fs::remove_dir(&dir);
inner.lock().unwrap().events.clear();
}
#[test]
fn test_record_gpu_passes() {
let inner = get_or_init();
inner.lock().unwrap().events.clear();
record_gpu_passes(
5000,
&[
("relu".into(), Duration::from_nanos(100)),
("matmul".into(), Duration::from_nanos(500)),
],
);
let guard = inner.lock().unwrap();
assert_eq!(guard.events.len(), 4); assert_eq!(guard.events[0].name, "relu");
assert_eq!(guard.events[0].timestamp_ns, 5000);
assert_eq!(guard.events[0].kind, EventKind::SliceBegin);
assert_eq!(guard.events[1].timestamp_ns, 5100); assert_eq!(guard.events[1].kind, EventKind::SliceEnd);
assert_eq!(guard.events[2].name, "matmul");
assert_eq!(guard.events[2].timestamp_ns, 5100);
assert_eq!(guard.events[3].timestamp_ns, 5600);
drop(guard);
inner.lock().unwrap().events.clear();
}
#[test]
fn test_now_ns_increases() {
let _ = get_or_init();
let t1 = now_ns();
for _ in 0..1000 {
std::hint::black_box(0);
}
let t2 = now_ns();
assert!(t2 >= t1);
}
}