§OxiBonsai Runtime
High-level inference engine, sampling, tokenizer bridge, and OpenAI-compatible HTTP server for OxiBonsai.
This crate ties together oxibonsai_core, oxibonsai_kernels,
and oxibonsai_model into a production-ready inference runtime:
- `InferenceEngine` - orchestrates prefill + decode with metrics
- `Sampler` - temperature, top-k, top-p, repetition penalty
- `SamplingPreset` - named parameter sets (Greedy, Balanced, Creative, …)
- `SamplerBuilder` / `ConfigBuilder` / `EngineBuilder` - ergonomic setup
- `server` - Axum-based `/v1/chat/completions` server (feature-gated)
- `InferenceMetrics` - Prometheus-compatible counters, gauges, histograms
- `CircuitBreaker` - resilience pattern for cascading-failure protection
- `HealthReport` - structured health checks for ops monitoring
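The builders aim to make custom setup as terse as the presets. A minimal sketch of the builder path is below; the setter names are illustrative assumptions, not the confirmed API (see the `builders` module for the actual surface):

```rust
use oxibonsai_runtime::builders::SamplerBuilder;

// Hypothetical builder calls: `temperature`, `top_k`, `top_p`, and
// `repetition_penalty` are assumed setters mirroring the Sampler's
// documented knobs; check the `builders` module for the real names.
let sampler = SamplerBuilder::new()
    .temperature(0.7)
    .top_k(40)
    .top_p(0.9)
    .repetition_penalty(1.1)
    .build();
```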
§Quick Start
```rust
use oxibonsai_core::config::Qwen3Config;
use oxibonsai_runtime::engine::InferenceEngine;
use oxibonsai_runtime::presets::SamplingPreset;

// Tiny test configuration and the Balanced sampling preset.
let config = Qwen3Config::tiny_test();
let params = SamplingPreset::Balanced.params();

// Build the engine and generate 10 tokens from a two-token prompt.
let mut engine = InferenceEngine::new(config, params, 42);
let tokens = engine.generate(&[151644, 872], 10)
    .expect("generation should succeed");
```

Re-exports§
pub use adaptive_lookahead::{AdaptiveLookahead, AdaptiveLookaheadConfig, AdaptiveLookaheadError};
pub use adaptive_sampling::{AdaptiveSamplerChain, AdaptiveStrategy, EntropyCooling, GenerationState, RepetitionAdaptation, ScheduledDecay};
pub use auto_tuner::{AutoTuner, CpuArch, CpuFeatures, KernelBenchmark, KvCacheType, MemoryBudget, SimdTier, TuningRecommendation};
pub use builders::{ConfigBuilder, EngineBuilder, SamplerBuilder};
pub use circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, CircuitState};
pub use config::OxiBonsaiConfig;
pub use constrained_decoding::{AllowListConstraint, ConstrainedSampler, ConstrainedSamplerBuilder, ConstraintError, JsonConstraint, JsonParseState, LengthConstraint, NoConstraint, RegexConstraint, SequenceConstraint, TokenConstraint};
pub use convenience::{GenerationResult, MemoryEstimate, ModelFileInfo, TokenStats};
pub use dedup::{DedupCache, DedupStats, RequestKey};
pub use distributed::{ConsistentHashRing, CoordinatorConfig, DistributedCoordinator, NodeInfo, NodeRegistry};
pub use engine::InferenceEngine;
pub use error::{RuntimeError, RuntimeResult};
pub use grammar::{compile_json_schema, compile_json_schema_str, compile_regex, parse_bnf, parse_gbnf, BnfParseError, EarleyRecognizer, GbnfParseError, Grammar, GrammarConstraint, JsonSchemaCompileError, RegexCompileError, Rule, Symbol};
pub use health::{HealthReport, HealthStatus};
pub use hot_reload::{HotReloadCoordinator, ModelVersion, ReloadLog};
pub use json_schema::{parse_schema, schema_example, schema_template, validate_against_schema, SchemaError, SchemaState, SchemaType};
pub use kv_cache_policy::{KvCacheLevel, KvCachePolicy, KvCachePolicyConfig, KvCachePolicyError};
pub use memory::{get_rss_bytes, MemoryProfiler, MemorySnapshot};
pub use metrics::InferenceMetrics;
pub use multi_model::{AdapterRef, AdapterStack, EndpointStatus, ModelEndpoint, ModelId, ModelListEntry, ModelRegistry, ModelRouter, RoutingError};
pub use native_tokenizer::{NativeTokenizerBridge, NativeTokenizerError};
pub use nbest::{Hypothesis, NBestDecoder, NBestList};
pub use presets::SamplingPreset;
pub use profiler::{flop_counter, AggregateStats, ProfileEvent, ProfileTrace, Profiler};
pub use quality_metrics::{extract_ngrams, perplexity_from_logprobs, repetition_penalty_rate, self_bleu, token_entropy, BatchQualityAnalyzer, BleuScore, DiversityMetrics, GenerationQualityReport, RepetitionMetrics};
pub use recovery::{ErrorClass, RecoveryStrategy};
pub use request_id::RequestId;
pub use request_metrics::{AggregateRateSnapshot, RequestRateAggregator, RequestRateSnapshot, RequestRateTracker};
pub use sampling::Sampler;
pub use sampling_advanced::{EtaSampler, LcgRng, MinPSampler, MirostatV1Sampler, MirostatV2Sampler, SamplerChain, SamplerStep, TypicalSampler};
pub use stream_metrics::{RequestStreamMetrics, StreamMetricsSnapshot, StreamingMetricsAggregator};
pub use token_budget::{BudgetConfig, BudgetError, BudgetPolicy, GlobalTokenBudget, RequestBudget, TokenCostEstimate};
pub use tokenizer_bridge::TokenizerBridge;
pub use tool_calling::{build_tool_constraint, make_tool_call, new_tool_call_id, select_tool, validate_tool_arguments, ToolCallError, ToolRegistry};
pub use tracing_setup::{init_tracing, TracingConfig};
pub use web_ui::create_ui_router;
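These re-exports are designed to compose; for instance, grammar-constrained generation pairs a compiled constraint with the sampler. A minimal sketch, assuming `compile_regex` returns a `Result` (the exact signature and constraint wiring are assumptions; see the `grammar` and `constrained_decoding` modules):

```rust
use oxibonsai_runtime::grammar::compile_regex;

// Compile an ISO-8601 date pattern into a decoding constraint.
// The Result-returning signature is assumed for illustration.
let date_constraint = compile_regex(r"\d{4}-\d{2}-\d{2}")
    .expect("regex should compile");
```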
Modules§
- adaptive_lookahead - Adaptive lookahead controller for speculative decoding.
- adaptive_sampling - Adaptive sampling: dynamically adjust temperature/top_p based on generation state.
- admin - Admin API endpoints for operational management.
- api_extensions - Extended `/v1/chat/completions` handler.
- api_types - Extended OpenAI-compatible API types.
- async_engine - Async wrapper around the synchronous `InferenceEngine`.
- auto_tuner - Performance auto-tuning: detect hardware, select kernels, tune parameters.
- batch_engine - Batched inference: process multiple prompts efficiently.
- beam_search - Beam search decoding for OxiBonsai.
- builders - Builder patterns for ergonomic OxiBonsai setup.
- circuit_breaker - Circuit breaker pattern for production resilience.
- completions - OpenAI v1 completions endpoint (legacy, non-chat).
- config - Layered configuration system for OxiBonsai.
- constrained_decoding - Grammar-constrained decoding for token-by-token generation.
- context_manager - Context window management for multi-turn inference.
- continuous_batch - Continuous (iteration-level) batching for OxiBonsai.
- convenience - High-level convenience functions for common OxiBonsai operations.
- dedup - Request deduplication: cache identical requests to avoid redundant inference.
- distributed - Distributed serving infrastructure for OxiBonsai.
- embedding_index - Navigable Small World (NSW) approximate nearest-neighbor index.
- embeddings - OpenAI v1 embeddings endpoint.
- engine - Inference engine orchestrating model loading and generation.
- error - Runtime error types.
- grammar - Context-free grammar engine with Earley-parser-based constrained decoding.
- health - Health check system for production monitoring.
- hot_reload - Model hot-reload: update model weights without server restart.
- json_schema - JSON Schema-driven structured output constraint.
- kv_cache_policy - KV cache compression policy controller.
- memory - Runtime memory profiling.
- metrics - Lightweight, thread-safe Prometheus-compatible metrics system.
- middleware - Request middleware: context injection, logging, CORS, and idempotency caching.
- model_cache - In-process model cache for GGUF files.
- multi_model - Multi-model serving: manage base models + LoRA adapters with smart routing.
- native_tokenizer - Native tokenizer bridge using oxibonsai-tokenizer (pure Rust BPE).
- nbest - N-best hypothesis tracking for diverse generation.
- ngram_cache - N-gram cache for zero-cost speculative decoding draft generation.
- pipeline - High-level inference pipeline API for OxiBonsai.
- prefix_cache_engine - Prefix-cache-aware inference engine wrapper.
- presets - Predefined sampling parameter presets for common use cases.
- profiler - Inference profiler: per-layer timing, memory, and FLOP accounting.
- quality_metrics - Generation quality metrics for evaluating LLM outputs.
- rate_limiter - Token bucket rate limiter with per-client and optional global limits.
- recovery - Error recovery strategies for production resilience.
- request_id - Structured request identifiers for end-to-end tracing.
- request_metrics - Per-request token-rate and latency metrics.
- request_queue - Bounded request queue with backpressure for the inference pipeline.
- sampling - Sampling strategies for text generation.
- sampling_advanced - Advanced sampling algorithms for text generation.
- semantic_cache - Semantic caching layer for LLM inference.
- server - OpenAI-compatible chat completions server (a client request sketch follows this list).
- speculative - Speculative decoding for accelerated autoregressive generation.
- stream_metrics - Token streaming metrics: TTFT, inter-token latency (TBT), throughput.
- streaming - Enhanced SSE streaming with delta tokens, finish reasons, and usage info.
- token_budget - Token budget management: enforce per-request and global token limits.
- token_healing - Token healing for partial-token prompt repair.
- tokenizer_bridge - Tokenizer bridge wrapping HuggingFace tokenizers.
- tool_calling - High-level tool-calling orchestration for OxiBonsai.
- tracing_setup - Tracing initialization with configurable output format.
- wasm_api - WASM-compatible inference API.
- web_ui - Web UI module: serves a minimal HTML chat interface from the Axum server.
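Because the feature-gated `server` module speaks the standard OpenAI chat-completions protocol, any OpenAI-compatible client can talk to it. A hedged client sketch follows: the bind address, port, and model name are placeholder assumptions, and it relies on the `reqwest` (with `blocking` and `json` features) and `serde_json` crates:

```rust
use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // OpenAI-style chat completion request; the model name and server
    // address below are illustrative, not values confirmed by the docs.
    let body = json!({
        "model": "oxibonsai",
        "messages": [{ "role": "user", "content": "Hello!" }],
        "max_tokens": 32
    });
    let resp: serde_json::Value = reqwest::blocking::Client::new()
        .post("http://127.0.0.1:8080/v1/chat/completions")
        .json(&body)
        .send()?
        .json()?;
    // Standard OpenAI response shape: first choice's message content.
    println!("{}", resp["choices"][0]["message"]["content"]);
    Ok(())
}
```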