use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
use std::ptr;
use anyhow::{Context, Result, anyhow};
use btf_rs::Btf;
use super::arena::{ArenaPage, ArenaSnapshot, BpfArenaOffsets};
use super::bpf_map::{
BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_STRUCT_OPS, BpfMapAccessor, BpfMapInfo,
};
// bpf(2) command numbers (mirror `enum bpf_cmd` in uapi <linux/bpf.h>).
const BPF_MAP_LOOKUP_ELEM: u32 = 1;
const BPF_MAP_GET_NEXT_KEY: u32 = 4;
const BPF_MAP_GET_NEXT_ID: u32 = 0xc;
const BPF_MAP_GET_FD_BY_ID: u32 = 0xe;
const BPF_OBJ_GET_INFO_BY_FD: u32 = 0xf;
const BPF_BTF_GET_FD_BY_ID: u32 = 0x13;
/// Fixed length of a BPF object name, including trailing NULs
/// (BPF_OBJ_NAME_LEN in the kernel uapi).
const BPF_OBJ_NAME_LEN: usize = 16;
/// Arena maps are addressed in 4 KiB pages.
const ARENA_PAGE_SIZE: usize = 4096;
/// Hard cap on the declared arena span we are willing to consider (4 GiB).
const MAX_ARENA_BYTES: u64 = 4 * 1024 * 1024 * 1024;
/// Hard cap on the number of arena pages we mmap and walk per snapshot.
const MAX_ARENA_PAGES: u64 = 16 * 1024;
/// `bpf_attr` layout for the element-access commands
/// (BPF_MAP_LOOKUP_ELEM / BPF_MAP_GET_NEXT_KEY).
/// Mirrors the anonymous struct in uapi <linux/bpf.h>; the size is pinned
/// by a test below, so do not reorder or resize fields.
#[repr(C)]
#[derive(Default)]
struct BpfAttrMapElem {
    map_fd: u32,
    // Explicit padding so `key` sits at an 8-byte offset, matching the
    // kernel's union layout.
    _pad0: u32,
    // Userspace pointer to the key buffer.
    key: u64,
    // Userspace pointer to the value buffer (LOOKUP_ELEM) or the
    // next-key output buffer (GET_NEXT_KEY).
    value_or_next_key: u64,
    flags: u64,
}
/// `bpf_attr` layout for the id-walk commands
/// (BPF_MAP_GET_NEXT_ID / BPF_MAP_GET_FD_BY_ID / BPF_BTF_GET_FD_BY_ID).
#[repr(C)]
#[derive(Default)]
struct BpfAttrGetId {
    // Input: the target id (GET_FD_BY_ID) or the id to continue the walk
    // from (GET_NEXT_ID).
    id_or_start_id: u32,
    // Output: written back by the kernel for GET_NEXT_ID.
    next_id: u32,
    open_flags: u32,
}
/// `bpf_attr` layout for BPF_OBJ_GET_INFO_BY_FD.
#[repr(C)]
#[derive(Default)]
struct BpfAttrInfoByFd {
    bpf_fd: u32,
    // Size of the userspace info buffer; tells the kernel how many bytes
    // it may fill.
    info_len: u32,
    // Userspace pointer to the info buffer (a BpfMapInfoUapi or
    // BpfBtfInfoUapi in this file).
    info: u64,
}
/// Leading prefix of the kernel's `struct bpf_map_info` (uapi
/// <linux/bpf.h>), truncated after `map_extra`. We pass this struct's size
/// as `info_len`, so the kernel fills only this prefix. A layout test
/// below guards against field drift.
#[repr(C)]
#[derive(Default)]
struct BpfMapInfoUapi {
    map_type: u32,
    id: u32,
    key_size: u32,
    value_size: u32,
    max_entries: u32,
    map_flags: u32,
    // NUL-padded object name, at most BPF_OBJ_NAME_LEN bytes.
    name: [u8; BPF_OBJ_NAME_LEN],
    ifindex: u32,
    btf_vmlinux_value_type_id: u32,
    netns_dev: u64,
    netns_ino: u64,
    // Id of the BTF object describing this map's types (0 if none).
    btf_id: u32,
    btf_key_type_id: u32,
    btf_value_type_id: u32,
    _pad: u32,
    // Map-type-specific extra data; for arena maps this is read as the
    // user_vm_start address (see read_arena_pages).
    map_extra: u64,
}
/// Mirror of the kernel's `struct bpf_btf_info` (uapi <linux/bpf.h>),
/// used with BPF_OBJ_GET_INFO_BY_FD on a BTF fd.
#[repr(C)]
#[derive(Default)]
struct BpfBtfInfoUapi {
    // Userspace pointer to a buffer receiving the raw BTF blob; when 0,
    // the kernel only reports `btf_size` (two-phase fetch, see
    // load_program_btf).
    btf: u64,
    btf_size: u32,
    id: u32,
    name: u64,
    name_len: u32,
    kernel_btf: u32,
}
/// Thin wrapper over the raw `bpf(2)` syscall; returns the kernel's raw
/// return value (negative on error, with errno set).
///
/// # Safety
/// `attr_ptr` must point to at least `attr_size` bytes laid out as the
/// kernel expects for `cmd`, and those bytes must stay valid for the
/// duration of the call. For commands that write back into the attr
/// (e.g. BPF_MAP_GET_NEXT_ID filling `next_id`), the memory must also be
/// writable.
unsafe fn bpf_syscall(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> i64 {
    unsafe { libc::syscall(libc::SYS_bpf, cmd as i64, attr_ptr, attr_size) as i64 }
}
/// Issues a `bpf(2)` call that is expected to return a file descriptor.
/// On failure the errno is captured and wrapped into an `anyhow` error.
fn bpf_call_fd(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> Result<RawFd> {
    match unsafe { bpf_syscall(cmd, attr_ptr, attr_size) } {
        ret if ret >= 0 => Ok(ret as RawFd),
        _ => {
            // Capture errno immediately, before any other libc call can
            // clobber it.
            let err = std::io::Error::last_os_error();
            Err(anyhow!("bpf({cmd}) failed: {err}"))
        }
    }
}
/// Issues a `bpf(2)` call whose return value only signals success/failure.
/// On failure the errno is captured and wrapped into an `anyhow` error.
fn bpf_call_status(cmd: u32, attr_ptr: *const u8, attr_size: usize) -> Result<()> {
    match unsafe { bpf_syscall(cmd, attr_ptr, attr_size) } {
        ret if ret >= 0 => Ok(()),
        _ => {
            // Capture errno immediately, before any other libc call can
            // clobber it.
            let err = std::io::Error::last_os_error();
            Err(anyhow!("bpf({cmd}) failed: {err}"))
        }
    }
}
/// A map discovered during the id walk. Holding `fd` keeps the map alive
/// (pinned) for as long as this struct exists.
struct PinnedMap {
    /// Converted metadata snapshot taken at discovery time.
    info: BpfMapInfo,
    /// Owned fd for the map; closed automatically on drop.
    fd: OwnedFd,
    /// Raw `map_extra` from the kernel info (arena user_vm_start for
    /// arena maps — see read_arena_pages).
    map_extra: u64,
}
/// `BpfMapAccessor` implementation backed by live `bpf(2)` syscalls against
/// the running kernel, holding an fd per discovered map.
#[allow(dead_code)]
pub struct BpfSyscallAccessor {
    maps: Vec<PinnedMap>,
}
impl BpfSyscallAccessor {
    /// Snapshots every BPF map currently loaded in the running kernel.
    #[allow(dead_code)]
    pub fn from_running_kernel() -> Result<Self> {
        Self::from_running_kernel_filtered(|_info: &BpfMapInfo| true)
    }
    /// Walks the kernel's global map-id space with BPF_MAP_GET_NEXT_ID,
    /// opens an fd for each id with BPF_MAP_GET_FD_BY_ID, fetches its
    /// info, and keeps the maps `predicate` accepts. Individual maps that
    /// vanish mid-walk or whose fd/info calls fail are logged and skipped;
    /// only a failing GET_NEXT_ID (other than end-of-walk) aborts.
    #[allow(dead_code)]
    pub fn from_running_kernel_filtered<F>(mut predicate: F) -> Result<Self>
    where
        F: FnMut(&BpfMapInfo) -> bool,
    {
        let mut maps: Vec<PinnedMap> = Vec::new();
        let mut start_id: u32 = 0;
        loop {
            // The kernel writes the id following `id_or_start_id` back
            // into `next_id`, so the attr must be mutable.
            let mut attr = BpfAttrGetId {
                id_or_start_id: start_id,
                next_id: 0,
                open_flags: 0,
            };
            // SAFETY: attr is a live, properly laid out BpfAttrGetId and
            // we pass its exact size.
            let res = unsafe {
                bpf_syscall(
                    BPF_MAP_GET_NEXT_ID,
                    &raw mut attr as *const u8,
                    std::mem::size_of::<BpfAttrGetId>(),
                )
            };
            if res < 0 {
                let err = std::io::Error::last_os_error();
                // ENOENT signals the end of the id walk, not a failure.
                if err.raw_os_error() == Some(libc::ENOENT) {
                    break;
                }
                return Err(anyhow!("BPF_MAP_GET_NEXT_ID failed: {err}"));
            }
            let next_id = attr.next_id;
            // Defensive: 0 is never a valid map id.
            if next_id == 0 {
                break;
            }
            start_id = next_id;
            let fd_attr = BpfAttrGetId {
                id_or_start_id: next_id,
                next_id: 0,
                open_flags: 0,
            };
            // SAFETY: fd_attr is a live, properly laid out BpfAttrGetId.
            let fd_ret = unsafe {
                bpf_syscall(
                    BPF_MAP_GET_FD_BY_ID,
                    &raw const fd_attr as *const u8,
                    std::mem::size_of::<BpfAttrGetId>(),
                )
            };
            if fd_ret < 0 {
                let err = std::io::Error::last_os_error();
                let raw = err.raw_os_error().unwrap_or(0);
                if raw == libc::ENOENT {
                    // Map was deleted between GET_NEXT_ID and here.
                    tracing::debug!(
                        map_id = next_id,
                        "BPF_MAP_GET_FD_BY_ID: map vanished mid-walk (ENOENT); skipping"
                    );
                } else {
                    tracing::warn!(
                        map_id = next_id,
                        errno = raw,
                        error = %err,
                        "BPF_MAP_GET_FD_BY_ID failed; skipping this map but continuing the walk"
                    );
                }
                continue;
            }
            // SAFETY: fd_ret is a fresh fd just returned by the kernel;
            // nothing else owns it yet.
            let fd = unsafe { OwnedFd::from_raw_fd(fd_ret as RawFd) };
            let (info, map_extra) = match obj_get_info_map(fd.as_raw_fd()) {
                Ok(pair) => pair,
                Err(e) => {
                    tracing::warn!(
                        map_id = next_id,
                        error = %e,
                        "BPF_OBJ_GET_INFO_BY_FD failed for pinned map; skipping"
                    );
                    continue;
                }
            };
            if !predicate(&info) {
                continue;
            }
            maps.push(PinnedMap {
                info,
                fd,
                map_extra,
            });
        }
        Ok(Self { maps })
    }
    /// Number of maps retained by the walk (test helper).
    #[cfg(test)]
    #[allow(dead_code)]
    pub(crate) fn pinned_count(&self) -> usize {
        self.maps.len()
    }
    /// Finds the pinned map whose active name bytes match `target`'s.
    /// NOTE(review): map names are not guaranteed unique in the kernel;
    /// the first match wins — confirm callers only probe uniquely named
    /// maps.
    fn pinned_for(&self, target: &BpfMapInfo) -> Option<&PinnedMap> {
        self.maps
            .iter()
            .find(|p| p.info.name_bytes_active() == target.name_bytes_active())
    }
}
/// Fetches `struct bpf_map_info` for `fd` via BPF_OBJ_GET_INFO_BY_FD and
/// converts it into the crate's `BpfMapInfo` representation.
///
/// Returns the converted info plus the raw `map_extra` field (used by
/// `read_arena_pages` as the arena user_vm_start).
fn obj_get_info_map(fd: RawFd) -> Result<(BpfMapInfo, u64)> {
    let mut info = BpfMapInfoUapi::default();
    let attr = BpfAttrInfoByFd {
        bpf_fd: fd as u32,
        info_len: std::mem::size_of::<BpfMapInfoUapi>() as u32,
        info: &raw mut info as u64,
    };
    bpf_call_status(
        BPF_OBJ_GET_INFO_BY_FD,
        &raw const attr as *const u8,
        std::mem::size_of::<BpfAttrInfoByFd>(),
    )
    .context("BPF_OBJ_GET_INFO_BY_FD on map fd")?;
    // Active name length = bytes before the first NUL; a full 16-byte name
    // has no NUL, hence the unwrap_or.
    let nul = info
        .name
        .iter()
        .position(|&b| b == 0)
        .unwrap_or(BPF_OBJ_NAME_LEN);
    let mut name_bytes = [0u8; BPF_OBJ_NAME_LEN];
    name_bytes.copy_from_slice(&info.name);
    Ok((
        BpfMapInfo {
            // Kernel-address fields are meaningless for a live-syscall
            // source; zeroed here.
            map_pa: 0,
            map_kva: 0,
            name_bytes,
            name_len: nul as u8,
            map_type: info.map_type,
            map_flags: info.map_flags,
            key_size: info.key_size,
            value_size: info.value_size,
            max_entries: info.max_entries,
            value_kva: None,
            // Repurposed: btf_kva carries the BTF object *id* here (see
            // load_program_btf, which reads it back as a u32 id).
            btf_kva: u64::from(info.btf_id),
            btf_value_type_id: info.btf_value_type_id,
            btf_vmlinux_value_type_id: 0,
            btf_key_type_id: info.btf_key_type_id,
        },
        info.map_extra,
    ))
}
impl BpfMapAccessor for BpfSyscallAccessor {
    /// Metadata snapshots of every pinned map.
    fn maps(&self) -> Vec<BpfMapInfo> {
        self.maps.iter().map(|p| p.info.clone()).collect()
    }

    /// Reads `len` bytes at `offset` inside the value of element 0 of an
    /// ARRAY or STRUCT_OPS map.
    ///
    /// Returns `None` when the map is not pinned here, has an unsupported
    /// type, the kernel lookup fails, or the requested range falls outside
    /// the value.
    fn read_value(&self, map: &BpfMapInfo, offset: usize, len: usize) -> Option<Vec<u8>> {
        let pinned = self.pinned_for(map)?;
        if map.map_type != BPF_MAP_TYPE_ARRAY && map.map_type != BPF_MAP_TYPE_STRUCT_OPS {
            return None;
        }
        let mut key: u32 = 0;
        let mut buf = vec![0u8; map.value_size as usize];
        let attr = BpfAttrMapElem {
            map_fd: pinned.fd.as_raw_fd() as u32,
            _pad0: 0,
            key: &raw mut key as u64,
            value_or_next_key: buf.as_mut_ptr() as u64,
            flags: 0,
        };
        bpf_call_status(
            BPF_MAP_LOOKUP_ELEM,
            &raw const attr as *const u8,
            std::mem::size_of::<BpfAttrMapElem>(),
        )
        .ok()?;
        // Checked add so an absurd offset/len pair cannot wrap around.
        let end = offset.checked_add(len)?;
        if end > buf.len() {
            return None;
        }
        Some(buf[offset..end].to_vec())
    }

    /// Walks a HASH / LRU_HASH map with GET_NEXT_KEY + LOOKUP_ELEM and
    /// returns every (key, value) pair it managed to read. A map mutating
    /// concurrently may yield a partial or slightly inconsistent snapshot.
    fn iter_hash_map(&self, map: &BpfMapInfo) -> Vec<(Vec<u8>, Vec<u8>)> {
        let Some(pinned) = self.pinned_for(map) else {
            return Vec::new();
        };
        if map.map_type != BPF_MAP_TYPE_HASH && map.map_type != BPF_MAP_TYPE_LRU_HASH {
            return Vec::new();
        }
        let key_sz = map.key_size as usize;
        let val_sz = map.value_size as usize;
        let mut out: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
        let mut cur_key = vec![0u8; key_sz];
        let mut next_key = vec![0u8; key_sz];
        // Bound the walk: concurrent insert/delete could otherwise keep
        // producing "next" keys forever.
        let cap = (map.max_entries as usize).saturating_mul(2).max(1);
        let mut got_first = false;
        for _ in 0..cap {
            let attr = BpfAttrMapElem {
                map_fd: pinned.fd.as_raw_fd() as u32,
                _pad0: 0,
                // A null key pointer on the first call asks the kernel for
                // the first key in the table.
                key: if got_first { cur_key.as_ptr() as u64 } else { 0 },
                value_or_next_key: next_key.as_mut_ptr() as u64,
                flags: 0,
            };
            // SAFETY: attr is properly laid out; key/next-key pointers
            // reference live buffers of key_size bytes for the whole call.
            let ret = unsafe {
                bpf_syscall(
                    BPF_MAP_GET_NEXT_KEY,
                    &raw const attr as *const u8,
                    std::mem::size_of::<BpfAttrMapElem>(),
                )
            };
            if ret < 0 {
                // End of table (or error): stop with what we have.
                break;
            }
            got_first = true;
            let mut value = vec![0u8; val_sz];
            let lookup_attr = BpfAttrMapElem {
                map_fd: pinned.fd.as_raw_fd() as u32,
                _pad0: 0,
                key: next_key.as_ptr() as u64,
                value_or_next_key: value.as_mut_ptr() as u64,
                flags: 0,
            };
            // SAFETY: same invariants; `value` holds val_sz writable bytes.
            let lret = unsafe {
                bpf_syscall(
                    BPF_MAP_LOOKUP_ELEM,
                    &raw const lookup_attr as *const u8,
                    std::mem::size_of::<BpfAttrMapElem>(),
                )
            };
            if lret >= 0 {
                out.push((next_key.clone(), value));
            }
            // The key may have been deleted between GET_NEXT_KEY and
            // LOOKUP_ELEM; advance the cursor either way.
            cur_key.copy_from_slice(&next_key);
        }
        out
    }

    /// Reads one element of a PERCPU_ARRAY map, returning one value per
    /// CPU (`None` per slot when the lookup fails or the slot falls
    /// outside the buffer).
    ///
    /// NOTE(review): assumes `num_cpus` is the kernel's possible-CPU
    /// count; a smaller value would undersize the kernel-filled buffer —
    /// confirm at the call site.
    fn read_percpu_array(&self, map: &BpfMapInfo, key: u32, num_cpus: u32) -> Vec<Option<Vec<u8>>> {
        let Some(pinned) = self.pinned_for(map) else {
            return Vec::new();
        };
        if map.map_type != BPF_MAP_TYPE_PERCPU_ARRAY {
            return Vec::new();
        }
        if key >= map.max_entries {
            return Vec::new();
        }
        let val_sz = map.value_size as usize;
        // FIX: per-cpu lookups copy round_up(value_size, 8) bytes per CPU,
        // so the buffer must be sized by the 8-byte stride, not the raw
        // value size — the previous `num_cpus * val_sz` sizing let the
        // kernel write past the end of `buf` whenever value_size was not
        // a multiple of 8. (iter_percpu_hash_map below already sizes its
        // buffer by stride; this now matches.)
        let stride = (val_sz + 7) & !7;
        let total = (num_cpus as usize).saturating_mul(stride);
        let mut buf = vec![0u8; total];
        let mut k: u32 = key;
        let attr = BpfAttrMapElem {
            map_fd: pinned.fd.as_raw_fd() as u32,
            _pad0: 0,
            key: &raw mut k as u64,
            value_or_next_key: buf.as_mut_ptr() as u64,
            flags: 0,
        };
        if bpf_call_status(
            BPF_MAP_LOOKUP_ELEM,
            &raw const attr as *const u8,
            std::mem::size_of::<BpfAttrMapElem>(),
        )
        .is_err()
        {
            return vec![None; num_cpus as usize];
        }
        // Slice the flat kernel buffer into per-CPU values; each CPU's
        // value occupies `stride` bytes of which `val_sz` are meaningful.
        let mut out = Vec::with_capacity(num_cpus as usize);
        for cpu in 0..num_cpus as usize {
            let start = cpu * stride;
            let end = start + val_sz;
            if end > buf.len() {
                out.push(None);
            } else {
                out.push(Some(buf[start..end].to_vec()));
            }
        }
        out
    }

    /// Walks a PERCPU_HASH / LRU_PERCPU_HASH map, returning each key with
    /// its per-CPU values (same stride layout as read_percpu_array).
    fn iter_percpu_hash_map(
        &self,
        map: &BpfMapInfo,
        num_cpus: u32,
    ) -> super::bpf_map::PerCpuHashEntries {
        let Some(pinned) = self.pinned_for(map) else {
            return Vec::new();
        };
        if map.map_type != BPF_MAP_TYPE_PERCPU_HASH && map.map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH
        {
            return Vec::new();
        }
        let key_sz = map.key_size as usize;
        let val_sz = map.value_size as usize;
        // Per-cpu values are laid out at an 8-byte-rounded stride.
        let stride = (val_sz + 7) & !7;
        let buf_total = (num_cpus as usize).saturating_mul(stride);
        let mut out: super::bpf_map::PerCpuHashEntries = Vec::new();
        let mut cur_key = vec![0u8; key_sz];
        let mut next_key = vec![0u8; key_sz];
        // Bound the walk against concurrent mutation (see iter_hash_map).
        let cap = (map.max_entries as usize).saturating_mul(2).max(1);
        let mut got_first = false;
        for _ in 0..cap {
            let attr = BpfAttrMapElem {
                map_fd: pinned.fd.as_raw_fd() as u32,
                _pad0: 0,
                // Null key on the first call yields the first key.
                key: if got_first { cur_key.as_ptr() as u64 } else { 0 },
                value_or_next_key: next_key.as_mut_ptr() as u64,
                flags: 0,
            };
            // SAFETY: attr is properly laid out; buffers are live and
            // correctly sized for the duration of the call.
            let ret = unsafe {
                bpf_syscall(
                    BPF_MAP_GET_NEXT_KEY,
                    &raw const attr as *const u8,
                    std::mem::size_of::<BpfAttrMapElem>(),
                )
            };
            if ret < 0 {
                break;
            }
            got_first = true;
            let mut value_buf = vec![0u8; buf_total];
            let lookup_attr = BpfAttrMapElem {
                map_fd: pinned.fd.as_raw_fd() as u32,
                _pad0: 0,
                key: next_key.as_ptr() as u64,
                value_or_next_key: value_buf.as_mut_ptr() as u64,
                flags: 0,
            };
            // SAFETY: value_buf holds buf_total writable bytes, sized by
            // the per-cpu stride.
            let lret = unsafe {
                bpf_syscall(
                    BPF_MAP_LOOKUP_ELEM,
                    &raw const lookup_attr as *const u8,
                    std::mem::size_of::<BpfAttrMapElem>(),
                )
            };
            if lret >= 0 {
                let mut per_cpu = Vec::with_capacity(num_cpus as usize);
                for cpu in 0..num_cpus as usize {
                    let start = cpu * stride;
                    let end = start + val_sz;
                    if end > value_buf.len() {
                        per_cpu.push(None);
                    } else {
                        per_cpu.push(Some(value_buf[start..end].to_vec()));
                    }
                }
                out.push((next_key.clone(), per_cpu));
            }
            cur_key.copy_from_slice(&next_key);
        }
        out
    }

    /// Snapshots the resident pages of an ARENA map by mmap'ing the map fd
    /// read-only and using mincore() to find which pages are populated,
    /// copying only those. The span is capped at MAX_ARENA_BYTES and the
    /// walk at MAX_ARENA_PAGES; both conditions are reported in the
    /// snapshot flags.
    fn read_arena_pages(
        &self,
        map: &BpfMapInfo,
        _arena_offsets: &BpfArenaOffsets,
    ) -> ArenaSnapshot {
        let Some(pinned) = self.pinned_for(map) else {
            return ArenaSnapshot::default();
        };
        if map.map_type != BPF_MAP_TYPE_ARENA {
            return ArenaSnapshot::default();
        }
        // max_entries is the arena size in pages; cap the byte span.
        let declared_bytes_raw = (map.max_entries as u64).saturating_mul(ARENA_PAGE_SIZE as u64);
        let span_capped = declared_bytes_raw > MAX_ARENA_BYTES;
        let declared_bytes = declared_bytes_raw.min(MAX_ARENA_BYTES);
        let declared_pages = declared_bytes / ARENA_PAGE_SIZE as u64;
        // map_extra carries the arena's userspace base address; page
        // indices are translated back to user addresses relative to it.
        let user_vm_start = pinned.map_extra;
        if declared_pages == 0 {
            return ArenaSnapshot {
                pages: Vec::new(),
                truncated: false,
                declared_pages: 0,
                span_capped,
                user_vm_start,
                ..Default::default()
            };
        }
        let walk_pages = declared_pages.min(MAX_ARENA_PAGES);
        let walk_bytes = (walk_pages as usize) * ARENA_PAGE_SIZE;
        let truncated = declared_pages > walk_pages;
        // SAFETY: plain read-only shared mapping of the map fd; the result
        // is checked against MAP_FAILED before use.
        let addr = unsafe {
            libc::mmap(
                ptr::null_mut(),
                walk_bytes,
                libc::PROT_READ,
                libc::MAP_SHARED,
                pinned.fd.as_raw_fd(),
                0,
            )
        };
        if addr == libc::MAP_FAILED {
            return ArenaSnapshot {
                pages: Vec::new(),
                truncated,
                declared_pages,
                span_capped,
                user_vm_start,
                ..Default::default()
            };
        }
        let mut pages: Vec<ArenaPage> = Vec::new();
        // One residency byte per page; bit 0 set means resident.
        let mut residency = vec![0u8; walk_pages as usize];
        // SAFETY: addr..addr+walk_bytes is a valid mapping (checked above)
        // and residency holds exactly walk_pages bytes.
        let mincore_ret = unsafe { libc::mincore(addr, walk_bytes, residency.as_mut_ptr()) };
        if mincore_ret == 0 {
            for (idx, &resident) in residency.iter().enumerate() {
                if resident & 1 == 0 {
                    // Not resident: reading would fault pages in; skip.
                    continue;
                }
                let page_addr = (addr as usize) + idx * ARENA_PAGE_SIZE;
                let mut buf = vec![0u8; ARENA_PAGE_SIZE];
                // SAFETY: page_addr is within the live mapping and buf is
                // a freshly allocated, non-overlapping ARENA_PAGE_SIZE
                // destination.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        page_addr as *const u8,
                        buf.as_mut_ptr(),
                        ARENA_PAGE_SIZE,
                    );
                }
                // Translate the page index into the arena's user address
                // space; skip on (theoretical) overflow rather than panic.
                let Some(idx_offset) = (idx as u64).checked_mul(ARENA_PAGE_SIZE as u64) else {
                    continue;
                };
                let Some(user_addr) = user_vm_start.checked_add(idx_offset) else {
                    continue;
                };
                pages.push(ArenaPage {
                    user_addr,
                    bytes: buf,
                });
            }
        }
        // SAFETY: unmapping the exact region mapped above; no pointers
        // into it survive this point.
        unsafe {
            libc::munmap(addr, walk_bytes);
        }
        ArenaSnapshot {
            pages,
            truncated,
            declared_pages,
            span_capped,
            user_vm_start,
            ..Default::default()
        }
    }

    /// Loads the (split) BTF blob attached to `map` and parses it against
    /// `base_btf`. `map.btf_kva` carries the BTF object id here (see
    /// obj_get_info_map). Returns `None` when the map has no BTF or any
    /// step fails.
    fn load_program_btf(&self, map: &BpfMapInfo, base_btf: &Btf) -> Option<Btf> {
        let btf_id = map.btf_kva as u32;
        if btf_id == 0 {
            return None;
        }
        let attr = BpfAttrGetId {
            id_or_start_id: btf_id,
            next_id: 0,
            open_flags: 0,
        };
        let btf_fd = bpf_call_fd(
            BPF_BTF_GET_FD_BY_ID,
            &raw const attr as *const u8,
            std::mem::size_of::<BpfAttrGetId>(),
        )
        .ok()?;
        // SAFETY: btf_fd is a fresh fd from the kernel; nothing else owns it.
        let btf_owned = unsafe { OwnedFd::from_raw_fd(btf_fd) };
        // Phase 1: query with a null `btf` pointer to learn btf_size.
        let mut info = BpfBtfInfoUapi::default();
        let info_attr = BpfAttrInfoByFd {
            bpf_fd: btf_owned.as_raw_fd() as u32,
            info_len: std::mem::size_of::<BpfBtfInfoUapi>() as u32,
            info: &raw mut info as u64,
        };
        bpf_call_status(
            BPF_OBJ_GET_INFO_BY_FD,
            &raw const info_attr as *const u8,
            std::mem::size_of::<BpfAttrInfoByFd>(),
        )
        .ok()?;
        if info.btf_size == 0 {
            return None;
        }
        // Phase 2: re-issue the call with `btf` pointing at our buffer so
        // the kernel copies the raw BTF blob into it.
        let mut buf = vec![0u8; info.btf_size as usize];
        info.btf = buf.as_mut_ptr() as u64;
        let info_attr2 = BpfAttrInfoByFd {
            bpf_fd: btf_owned.as_raw_fd() as u32,
            info_len: std::mem::size_of::<BpfBtfInfoUapi>() as u32,
            info: &raw mut info as u64,
        };
        bpf_call_status(
            BPF_OBJ_GET_INFO_BY_FD,
            &raw const info_attr2 as *const u8,
            std::mem::size_of::<BpfAttrInfoByFd>(),
        )
        .ok()?;
        Btf::from_split_bytes(&buf, base_btf).ok()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // The attr structs are passed raw to bpf(2); any size drift would
    // corrupt the syscall interface, so their sizes are pinned here.
    #[test]
    fn bpf_attr_map_elem_size() {
        assert_eq!(std::mem::size_of::<BpfAttrMapElem>(), 32);
    }
    #[test]
    fn bpf_attr_get_id_size() {
        assert_eq!(std::mem::size_of::<BpfAttrGetId>(), 12);
    }
    #[test]
    fn bpf_attr_info_by_fd_size() {
        assert_eq!(std::mem::size_of::<BpfAttrInfoByFd>(), 16);
    }
    // Guards BpfMapInfoUapi's field offsets against accidental edits:
    // map_type at 0, name at 24, and map_extra flush with the struct end.
    #[test]
    fn bpf_map_info_uapi_layout() {
        use crate::assert::Verdict;
        let off_map_type = std::mem::offset_of!(BpfMapInfoUapi, map_type);
        let off_name = std::mem::offset_of!(BpfMapInfoUapi, name);
        let total_size = std::mem::size_of::<BpfMapInfoUapi>();
        let off_map_extra = std::mem::offset_of!(BpfMapInfoUapi, map_extra);
        let map_extra_tail = off_map_extra + 8;
        let mut v = Verdict::new();
        crate::claim!(v, off_map_type).eq(0usize);
        crate::claim!(v, off_name).eq(24usize);
        crate::claim!(v, map_extra_tail).eq(total_size);
        let r = v.into_result();
        assert!(r.passed, "bpf_map_info uapi layout drift: {:?}", r.details,);
    }
    // The per-cpu value stride is value_size rounded up to 8 bytes;
    // exercises the `(v + 7) & !7` expression used by the accessors.
    #[test]
    fn percpu_stride_round_up() {
        let cases = [
            (0usize, 0),
            (1, 8),
            (7, 8),
            (8, 8),
            (9, 16),
            (15, 16),
            (16, 16),
        ];
        for (val_sz, expected) in cases {
            let stride = (val_sz + 7) & !7;
            assert_eq!(stride, expected, "value_size {val_sz} → stride {stride}");
        }
    }
    // Compile-time check only: verifies the filtered constructor accepts a
    // plain FnMut(&BpfMapInfo) -> bool closure (never executed).
    #[test]
    fn predicate_filters_pinned_set() {
        fn _check_predicate_shape() {
            let _ =
                BpfSyscallAccessor::from_running_kernel_filtered(|_info: &BpfMapInfo| -> bool {
                    false
                });
        }
    }
}