// Submodule declarations. None of these is marked `pub`, so the modules
// themselves are private to this module; code outside it reaches their items
// only through the `pub use` re-exports in this file.
mod batch;
mod buffer;
mod circuit;
mod connection;
mod memory;
mod perf_metrics;
mod profiling;
mod rate_limit;
mod resource_pool;
mod shutdown;
// Public re-exports for the submodules declared at the top of this file.
// Each `pub use` lifts selected items into this module's namespace so callers
// can import them from here without naming the private submodule.

// Items re-exported from `profiling`.
pub use profiling::{
cached_nanos, cached_nanos_or_now, cpu_cycles, get_page_faults, init_time_service,
with_page_fault_tracking,
};
// Items re-exported from `perf_metrics`.
pub use perf_metrics::{InferencePhase, PerfMetrics};
// `AlignedBuffer` is re-exported on every target except wasm32; the remaining
// `memory` items below are re-exported unconditionally.
#[cfg(not(target_arch = "wasm32"))]
pub use memory::AlignedBuffer;
pub use memory::{
is_direct_io_aligned, madvise_region, prefetch_for_inference, prefetch_ptr, prefetch_slice,
CacheAligned, MemoryAdvice, PrefetchLocality, CACHE_LINE_SIZE, CACHE_LINE_SIZE_F32,
DIRECT_IO_ALIGNMENT,
};
// One re-export line per remaining submodule: `buffer`, `circuit`, `shutdown`,
// `resource_pool`, `rate_limit`, `connection`, and `batch`.
pub use buffer::{BufferWatermarks, WatermarkedBuffer};
pub use circuit::{CircuitBreaker, CircuitState};
pub use shutdown::{GracefulShutdown, ShutdownGuard, ShutdownResult};
pub use resource_pool::{PooledResource, ResourcePool};
pub use rate_limit::{LimitError, ServeLimits};
pub use connection::{ConnectionState, KeepAliveConfig, ManagedConnection};
pub use batch::{balance211, split_batch, Balance211Iter, BatchSplitStrategy};
// From here on each private submodule declaration is immediately followed by
// the `pub use` that re-exports its items from this module.

// `kv_cache` and its re-exports.
mod kv_cache;
pub use kv_cache::{KvCacheManager, KvCacheSlotInfo, SequentialBatchOrderer};
// `simd_config` and its re-exports.
mod simd_config;
pub use simd_config::{
unroll_tail_process, AmxTileConfig, LazySimdConfig, SimdBackendState, UnrollFactor,
UnrollTailIterator,
};
// `exec_graph` and its re-exports.
mod exec_graph;
pub use exec_graph::{
BrickBottleneck, BrickCategory, BrickId, BrickSample, BrickStats, CategoryStats, EdgeType,
ExecutionEdge, ExecutionGraph, ExecutionNode, ExecutionNodeId, PtxRegistry, SyncMode,
TransferDirection,
};
// `profiler` and its re-exports. This is a separate submodule from
// `profiling`, which is declared earlier in this file.
mod profiler;
pub use profiler::{
fnv1a_f32_checksum, BrickIdTimer, BrickProfiler, BrickTimer, DivergenceInfo, KernelChecksum,
TileLevel, TileStats, TileTimer,
};
// `tracing` and its re-exports. This is a local submodule declared with
// `mod`; inside this module the bare path `tracing::` refers to it.
mod tracing;
pub use tracing::{
AttentionTraceConfig, AttentionWeightTrace, KvCacheSessionTrace, KvCacheStateTrace,
LayerActivationTrace, LogitEvolutionTrace, ModelActivationTrace, ModelQuantizationError,
ModelTracer, ModelTracerConfig, ModelTracerSummary, QuantType, QuantizationErrorTrace,
TensorStats, TokenLogitEvolution,
};
// `patterns` and its re-exports.
mod patterns;
pub use patterns::{
reserve_capacity, AsyncResult, BoundedQueue, DualWakerState, FlowControlError,
GraphReuseCounter, ReserveStrategy, StrategicBuffer, StreamCapacity, WakeDecision,
WakeSkipState,
};
// Operation submodules (`ops`, `fused_ops`, `attention`, `quant_ops`); every
// item re-exported from them has a name ending in `Op`, `Weights`, or
// starting with `Block`.
mod ops;
pub use ops::{AddOp, DotOp, MatmulOp, SoftmaxOp};
mod fused_ops;
pub use fused_ops::{FusedGateUpOp, FusedGateUpWeights, FusedQKVOp, FusedQKVWeights};
mod attention;
pub use attention::AttentionOp;
mod quant_ops;
pub use quant_ops::{BlockQ5K, BlockQ6K, DotQ5KOp, DotQ6KOp};
// The `tests` submodule is compiled only when `cfg(test)` is set (test
// builds); nothing is re-exported from it.
#[cfg(test)]
mod tests;
// Remaining private submodules, each followed by the re-export of its items.

// `async_profiler` and its re-export.
mod async_profiler;
pub use async_profiler::AsyncTaskProfiler;
// `budget` and its re-exports.
mod budget;
pub use budget::{ByteBudget, TokenBudget, TokenResult};
// `types` and its re-exports.
mod types;
pub use types::{
AssertionResult, Backend, BrickError, BrickVerification, ComputeAssertion, ComputeBackend,
ComputeOp,
};
// `compute_brick` and its re-exports.
mod compute_brick;
pub use compute_brick::{BrickLayer, ComputeBrick};