use std::sync::OnceLock;
/// Compile-time fallback cache-line size in bytes, used when runtime
/// detection yields nothing: 128 on aarch64 macOS builds, 64 on every other
/// target.
const PLATFORM_CACHE_LINE: usize = if cfg!(all(target_arch = "aarch64", target_os = "macos")) {
    128
} else {
    64
};

/// Compile-time default for `RuntimeConfig::par_threshold`: 16 384 on
/// aarch64 macOS builds, 30 000 on every other target.
const PLATFORM_PAR_THRESHOLD: usize = if cfg!(all(target_arch = "aarch64", target_os = "macos")) {
    16_384
} else {
    30_000
};
/// Snapshot of the hardware characteristics probed at start-up.
///
/// Every field other than `total_cpus` uses `0` to mean "not detected".
struct HwInfo {
    /// Parallelism reported by `std::thread::available_parallelism`
    /// (the detector falls back to 2 when that call fails).
    total_cpus: usize,
    /// Performance-core count (macOS `hw.perflevel0.physicalcpu`).
    perf_cores: usize,
    /// L1 data cache size in bytes.
    l1d_cache: usize,
    /// L2 cache size in bytes.
    l2_cache: usize,
    /// Cache-line size in bytes as reported by the OS.
    cache_line: usize,
}
impl HwInfo {
    /// Probes the current machine.
    ///
    /// * `total_cpus` comes from `std::thread::available_parallelism`
    ///   (2 if that call fails).
    /// * On macOS the remaining fields are read through `sysctl`.
    /// * On Linux they are read from `/sys/devices/system/cpu/cpu0/cache`.
    /// * On any other OS they stay 0 ("not detected").
    ///
    /// Probing is best-effort: any value that cannot be read or parsed is
    /// left at 0 rather than reported as an error.
    fn detect() -> Self {
        let total_cpus = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(2);

        // `mut` is only exercised on targets that have a probing block below.
        #[allow(unused_mut)]
        let mut info = HwInfo {
            total_cpus,
            perf_cores: 0,
            l1d_cache: 0,
            l2_cache: 0,
            cache_line: 0,
        };

        #[cfg(target_os = "macos")]
        {
            info.perf_cores = sysctl_usize("hw.perflevel0.physicalcpu").unwrap_or(0);
            info.l1d_cache = sysctl_usize("hw.l1dcachesize").unwrap_or(0);
            info.l2_cache = sysctl_usize("hw.l2cachesize").unwrap_or(0);
            info.cache_line = sysctl_usize("hw.cachelinesize").unwrap_or(0);
        }

        #[cfg(target_os = "linux")]
        {
            // Upper bound on the `indexN` directories we look at; the scan
            // also stops at the first index that does not exist.
            const MAX_CACHE_INDICES: usize = 8;

            info.cache_line = Self::read_cpu0_cache_attr(0, "coherency_line_size")
                .and_then(|raw| raw.parse().ok())
                .unwrap_or(0);

            // Walk the cache hierarchy instead of assuming `index0` is the L1
            // data cache: the first non-instruction level-1 entry becomes
            // `l1d_cache`, the first non-instruction level-2 entry `l2_cache`.
            for index in 0..MAX_CACHE_INDICES {
                let Some(level) = Self::read_cpu0_cache_attr(index, "level") else {
                    break;
                };
                let kind = Self::read_cpu0_cache_attr(index, "type").unwrap_or_default();
                if kind == "Instruction" {
                    continue;
                }
                let Some(size) = Self::read_cpu0_cache_attr(index, "size") else {
                    continue;
                };
                let bytes = Self::parse_sysfs_size(&size);
                match level.as_str() {
                    "1" if info.l1d_cache == 0 => info.l1d_cache = bytes,
                    "2" if info.l2_cache == 0 => info.l2_cache = bytes,
                    _ => {}
                }
            }

            // Fallback for kernels that expose `size` without `level`/`type`:
            // treat `index0` as the L1 data cache.
            if info.l1d_cache == 0 {
                if let Some(size) = Self::read_cpu0_cache_attr(0, "size") {
                    info.l1d_cache = Self::parse_sysfs_size(&size);
                }
            }
        }

        info
    }

    /// Reads one attribute file of `cpu0`'s cache `index{index}` directory
    /// and returns its trimmed contents, or `None` if it cannot be read.
    #[cfg(target_os = "linux")]
    fn read_cpu0_cache_attr(index: usize, attr: &str) -> Option<String> {
        let path = format!(
            "/sys/devices/system/cpu/cpu0/cache/index{}/{}",
            index, attr
        );
        std::fs::read_to_string(path)
            .ok()
            .map(|raw| raw.trim().to_owned())
    }

    /// Converts a sysfs cache size such as `"32K"` into bytes.
    ///
    /// A trailing `K`/`k` multiplies by 1024; a bare number is taken as
    /// bytes; anything unparsable yields 0.
    #[cfg(target_os = "linux")]
    fn parse_sysfs_size(raw: &str) -> usize {
        let raw = raw.trim();
        match raw.strip_suffix('K').or_else(|| raw.strip_suffix('k')) {
            Some(kib) => kib.parse::<usize>().unwrap_or(0).saturating_mul(1024),
            None => raw.parse().unwrap_or(0),
        }
    }

    /// Suggested worker-pool size: half of the performance cores when their
    /// count is known, otherwise half of all CPUs, clamped to `1..=15`.
    fn optimal_workers(&self) -> usize {
        let cores = if self.perf_cores > 0 {
            self.perf_cores
        } else {
            self.total_cpus
        };
        (cores / 2).clamp(1, 15)
    }

    /// Cache-line size in bytes: the detected value when available,
    /// otherwise the compile-time `PLATFORM_CACHE_LINE` fallback.
    fn cache_line(&self) -> usize {
        if self.cache_line > 0 {
            self.cache_line
        } else {
            PLATFORM_CACHE_LINE
        }
    }

    /// Threshold for fused attention; currently a fixed 64.
    ///
    /// The previous implementation branched on `l1d_cache` but returned 64 on
    /// both paths, so the branch was dropped. `&self` is kept so the value
    /// can become hardware-dependent without touching callers.
    // No caller exists in this file yet.
    #[allow(dead_code)]
    fn fuse_attn_threshold(&self) -> usize {
        64
    }
}
/// Reads an integer-valued sysctl by name.
///
/// Returns `None` when:
/// * `name` contains an interior NUL byte,
/// * the `sysctlbyname` call fails (e.g. unknown key, or a value larger than
///   8 bytes), or
/// * the kernel reports a value that is neither 4 nor 8 bytes wide (for
///   example a short string), or one that does not fit in `usize`.
#[cfg(target_os = "macos")]
fn sysctl_usize(name: &str) -> Option<usize> {
    use std::ffi::{CString, c_char, c_int, c_void};

    unsafe extern "C" {
        fn sysctlbyname(
            name: *const c_char,
            oldp: *mut c_void,
            oldlenp: *mut usize,
            newp: *mut c_void,
            newlen: usize,
        ) -> c_int;
    }

    let cname = CString::new(name).ok()?;
    let mut buf = [0u8; std::mem::size_of::<u64>()];
    let mut len = buf.len();

    // SAFETY: `cname` is a valid NUL-terminated string that outlives the
    // call; `buf` is writable for `len` bytes and `len` is passed by pointer
    // so the kernel can report how many bytes it wrote; a null `newp` with
    // `newlen == 0` requests a read-only query.
    let ret = unsafe {
        sysctlbyname(
            cname.as_ptr(),
            buf.as_mut_ptr().cast::<c_void>(),
            &mut len,
            std::ptr::null_mut(),
            0,
        )
    };
    if ret != 0 {
        return None;
    }

    // Decode according to the width the kernel actually wrote, instead of
    // assuming the untouched high bytes happen to line up.
    let value = match len {
        4 => u64::from(u32::from_ne_bytes([buf[0], buf[1], buf[2], buf[3]])),
        8 => u64::from_ne_bytes(buf),
        _ => return None,
    };
    usize::try_from(value).ok()
}
/// Runtime tuning knobs.
///
/// Instances are normally built by `RuntimeConfig::auto_detect` (hardware
/// probing plus built-in defaults) or `RuntimeConfig::from_env` (the same,
/// with `RLX_*` environment overrides applied).
#[derive(Debug, Clone)]
pub struct RuntimeConfig {
    /// Number of pool workers. Overridable via `RLX_WORKERS`.
    pub pool_workers: usize,
    /// Parallelisation threshold; defaults to the platform constant
    /// `PLATFORM_PAR_THRESHOLD`. Overridable via `RLX_PAR_THRESHOLD`.
    // NOTE(review): unit (elements vs. rows) is not visible here — confirm with consumers.
    pub par_threshold: usize,
    /// Minimum rows handed to a single thread (default 4).
    pub min_rows_per_thread: usize,
    /// Sequence-length threshold for SDPA (default 32). Overridable via
    /// `RLX_SDPA_THRESHOLD`.
    // NOTE(review): which side of the threshold selects which kernel is decided by consumers.
    pub sdpa_seq_threshold: usize,
    /// Arena alignment; defaults to the detected cache-line size.
    /// Overridable via `RLX_ARENA_ALIGN`.
    pub arena_alignment: usize,
    /// Default epsilon (1e-12), presumably for layer normalisation.
    pub ln_eps_default: f32,
    /// Large negative value (-1e9), presumably added to masked attention scores.
    pub attn_mask_neg_inf: f32,
    /// Score magnitude (default 1e-8) below which work may presumably be skipped.
    pub score_skip_threshold: f32,
    /// Cut-off (default 0.5), presumably used to binarise mask values.
    pub mask_binary_threshold: f32,
    /// Verbosity level; `from_env` prints diagnostics to stderr when this
    /// is at least 1. Overridable via `RLX_VERBOSE`.
    pub verbose: u8,
}
impl Default for RuntimeConfig {
fn default() -> Self {
Self::auto_detect()
}
}
impl RuntimeConfig {
pub fn auto_detect() -> Self {
let hw = HwInfo::detect();
Self {
pool_workers: hw.optimal_workers(),
par_threshold: PLATFORM_PAR_THRESHOLD,
min_rows_per_thread: 4,
sdpa_seq_threshold: 32,
arena_alignment: hw.cache_line(),
ln_eps_default: 1e-12,
attn_mask_neg_inf: -1e9,
score_skip_threshold: 1e-8,
mask_binary_threshold: 0.5,
verbose: 0,
}
}
pub fn from_env() -> Self {
let mut cfg = Self::auto_detect();
if let Some(v) = rlx_ir::env::var("RLX_WORKERS")
&& let Ok(n) = v.parse::<usize>()
{
cfg.pool_workers = if n == 0 { cfg.pool_workers } else { n.min(15) };
}
if let Some(v) = rlx_ir::env::var("RLX_PAR_THRESHOLD")
&& let Ok(n) = v.parse()
{
cfg.par_threshold = n;
}
if let Some(v) = rlx_ir::env::var("RLX_SDPA_THRESHOLD")
&& let Ok(n) = v.parse()
{
cfg.sdpa_seq_threshold = n;
}
if let Some(v) = rlx_ir::env::var("RLX_ARENA_ALIGN")
&& let Ok(n) = v.parse()
{
cfg.arena_alignment = n;
}
if let Some(v) = rlx_ir::env::var("RLX_VERBOSE")
&& let Ok(n) = v.parse()
{
cfg.verbose = n;
}
if cfg.verbose >= 1 {
let hw = HwInfo::detect();
eprintln!(
"[rlx] hw: {} CPUs ({} P-cores), L1={}KB, L2={}KB, cacheline={}B",
hw.total_cpus,
hw.perf_cores,
hw.l1d_cache / 1024,
hw.l2_cache / 1024,
hw.cache_line()
);
eprintln!(
"[rlx] config: workers={}, par_thr={}, sdpa_thr={}, align={}",
cfg.pool_workers, cfg.par_threshold, cfg.sdpa_seq_threshold, cfg.arena_alignment
);
}
cfg
}
pub fn install(&self) {
rlx_ir::env::set("RLX_WORKERS", self.pool_workers.to_string());
rlx_ir::env::set("RLX_PAR_THRESHOLD", self.par_threshold.to_string());
rlx_ir::env::set("RLX_SDPA_THRESHOLD", self.sdpa_seq_threshold.to_string());
rlx_ir::env::set("RLX_ARENA_ALIGN", self.arena_alignment.to_string());
rlx_ir::env::set("RLX_VERBOSE", self.verbose.to_string());
}
pub fn global() -> &'static RuntimeConfig {
static CONFIG: OnceLock<RuntimeConfig> = OnceLock::new();
CONFIG.get_or_init(RuntimeConfig::from_env)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Auto-detection yields 1..=15 workers, an alignment of at least 64
    /// and logging switched off.
    #[test]
    fn auto_detect_sane_defaults() {
        let detected = RuntimeConfig::auto_detect();
        assert!((1..=15).contains(&detected.pool_workers));
        assert!(detected.arena_alignment >= 64);
        assert_eq!(detected.verbose, 0);
    }

    /// Two reads of the global configuration agree on the worker count.
    #[test]
    fn global_is_consistent() {
        let first = RuntimeConfig::global().pool_workers;
        let second = RuntimeConfig::global().pool_workers;
        assert_eq!(first, second);
    }

    /// Hardware probing reports at least one CPU and, on macOS, a non-zero
    /// cache-line size.
    #[test]
    fn hw_detection() {
        let probed = HwInfo::detect();
        assert!(probed.total_cpus >= 1);
        #[cfg(target_os = "macos")]
        assert!(
            probed.cache_line > 0,
            "expected sysctl to return cache line size"
        );
    }
}