Crate oxibonsai_runtime


§OxiBonsai Runtime

High-level inference engine, sampling, tokenizer bridge, and OpenAI-compatible HTTP server for OxiBonsai.

This crate ties together oxibonsai_core, oxibonsai_kernels, and oxibonsai_model into a production-ready inference runtime: model loading and generation (engine), sampling strategies (sampling, sampling_advanced), tokenizer bridges (tokenizer_bridge, native_tokenizer), and an OpenAI-compatible HTTP server (server).

§Quick Start

use oxibonsai_core::config::Qwen3Config;
use oxibonsai_runtime::engine::InferenceEngine;
use oxibonsai_runtime::presets::SamplingPreset;

// Minimal test configuration; swap in a real model config for actual use.
let config = Qwen3Config::tiny_test();
let params = SamplingPreset::Balanced.params();
// Arguments: model config, sampling params, and an RNG seed (42)
// for reproducible sampling.
let mut engine = InferenceEngine::new(config, params, 42);

// Generate up to 10 tokens from a prompt given as raw token IDs.
let tokens = engine.generate(&[151644, 872], 10)
    .expect("generation should succeed");

§Re-exports

pub use adaptive_lookahead::AdaptiveLookahead;
pub use adaptive_lookahead::AdaptiveLookaheadConfig;
pub use adaptive_lookahead::AdaptiveLookaheadError;
pub use adaptive_sampling::AdaptiveSamplerChain;
pub use adaptive_sampling::AdaptiveStrategy;
pub use adaptive_sampling::EntropyCooling;
pub use adaptive_sampling::GenerationState;
pub use adaptive_sampling::RepetitionAdaptation;
pub use adaptive_sampling::ScheduledDecay;
pub use auto_tuner::AutoTuner;
pub use auto_tuner::CpuArch;
pub use auto_tuner::CpuFeatures;
pub use auto_tuner::KernelBenchmark;
pub use auto_tuner::KvCacheType;
pub use auto_tuner::MemoryBudget;
pub use auto_tuner::SimdTier;
pub use auto_tuner::TuningRecommendation;
pub use builders::ConfigBuilder;
pub use builders::EngineBuilder;
pub use builders::SamplerBuilder;
pub use circuit_breaker::CircuitBreaker;
pub use circuit_breaker::CircuitBreakerConfig;
pub use circuit_breaker::CircuitState;
pub use config::OxiBonsaiConfig;
pub use constrained_decoding::AllowListConstraint;
pub use constrained_decoding::ConstrainedSampler;
pub use constrained_decoding::ConstrainedSamplerBuilder;
pub use constrained_decoding::ConstraintError;
pub use constrained_decoding::JsonConstraint;
pub use constrained_decoding::JsonParseState;
pub use constrained_decoding::LengthConstraint;
pub use constrained_decoding::NoConstraint;
pub use constrained_decoding::RegexConstraint;
pub use constrained_decoding::SequenceConstraint;
pub use constrained_decoding::TokenConstraint;
pub use convenience::GenerationResult;
pub use convenience::MemoryEstimate;
pub use convenience::ModelFileInfo;
pub use convenience::TokenStats;
pub use dedup::DedupCache;
pub use dedup::DedupStats;
pub use dedup::RequestKey;
pub use distributed::ConsistentHashRing;
pub use distributed::CoordinatorConfig;
pub use distributed::DistributedCoordinator;
pub use distributed::NodeInfo;
pub use distributed::NodeRegistry;
pub use engine::InferenceEngine;
pub use error::RuntimeError;
pub use error::RuntimeResult;
pub use grammar::compile_json_schema;
pub use grammar::compile_json_schema_str;
pub use grammar::compile_regex;
pub use grammar::parse_bnf;
pub use grammar::parse_gbnf;
pub use grammar::BnfParseError;
pub use grammar::EarleyRecognizer;
pub use grammar::GbnfParseError;
pub use grammar::Grammar;
pub use grammar::GrammarConstraint;
pub use grammar::JsonSchemaCompileError;
pub use grammar::RegexCompileError;
pub use grammar::Rule;
pub use grammar::Symbol;
pub use health::HealthReport;
pub use health::HealthStatus;
pub use hot_reload::HotReloadCoordinator;
pub use hot_reload::ModelVersion;
pub use hot_reload::ReloadLog;
pub use json_schema::parse_schema;
pub use json_schema::schema_example;
pub use json_schema::schema_template;
pub use json_schema::validate_against_schema;
pub use json_schema::SchemaError;
pub use json_schema::SchemaState;
pub use json_schema::SchemaType;
pub use kv_cache_policy::KvCacheLevel;
pub use kv_cache_policy::KvCachePolicy;
pub use kv_cache_policy::KvCachePolicyConfig;
pub use kv_cache_policy::KvCachePolicyError;
pub use memory::get_rss_bytes;
pub use memory::MemoryProfiler;
pub use memory::MemorySnapshot;
pub use metrics::InferenceMetrics;
pub use multi_model::AdapterRef;
pub use multi_model::AdapterStack;
pub use multi_model::EndpointStatus;
pub use multi_model::ModelEndpoint;
pub use multi_model::ModelId;
pub use multi_model::ModelListEntry;
pub use multi_model::ModelRegistry;
pub use multi_model::ModelRouter;
pub use multi_model::RoutingError;
pub use native_tokenizer::NativeTokenizerBridge;
pub use native_tokenizer::NativeTokenizerError;
pub use nbest::Hypothesis;
pub use nbest::NBestDecoder;
pub use nbest::NBestList;
pub use presets::SamplingPreset;
pub use profiler::flop_counter;
pub use profiler::AggregateStats;
pub use profiler::ProfileEvent;
pub use profiler::ProfileTrace;
pub use profiler::Profiler;
pub use quality_metrics::extract_ngrams;
pub use quality_metrics::perplexity_from_logprobs;
pub use quality_metrics::repetition_penalty_rate;
pub use quality_metrics::self_bleu;
pub use quality_metrics::token_entropy;
pub use quality_metrics::BatchQualityAnalyzer;
pub use quality_metrics::BleuScore;
pub use quality_metrics::DiversityMetrics;
pub use quality_metrics::GenerationQualityReport;
pub use quality_metrics::RepetitionMetrics;
pub use recovery::ErrorClass;
pub use recovery::RecoveryStrategy;
pub use request_id::RequestId;
pub use request_metrics::AggregateRateSnapshot;
pub use request_metrics::RequestRateAggregator;
pub use request_metrics::RequestRateSnapshot;
pub use request_metrics::RequestRateTracker;
pub use sampling::Sampler;
pub use sampling_advanced::EtaSampler;
pub use sampling_advanced::LcgRng;
pub use sampling_advanced::MinPSampler;
pub use sampling_advanced::MirostatV1Sampler;
pub use sampling_advanced::MirostatV2Sampler;
pub use sampling_advanced::SamplerChain;
pub use sampling_advanced::SamplerStep;
pub use sampling_advanced::TypicalSampler;
pub use stream_metrics::RequestStreamMetrics;
pub use stream_metrics::StreamMetricsSnapshot;
pub use stream_metrics::StreamingMetricsAggregator;
pub use token_budget::BudgetConfig;
pub use token_budget::BudgetError;
pub use token_budget::BudgetPolicy;
pub use token_budget::GlobalTokenBudget;
pub use token_budget::RequestBudget;
pub use token_budget::TokenCostEstimate;
pub use tokenizer_bridge::TokenizerBridge;
pub use tool_calling::build_tool_constraint;
pub use tool_calling::make_tool_call;
pub use tool_calling::new_tool_call_id;
pub use tool_calling::select_tool;
pub use tool_calling::validate_tool_arguments;
pub use tool_calling::ToolCallError;
pub use tool_calling::ToolRegistry;
pub use tracing_setup::init_tracing;
pub use tracing_setup::TracingConfig;
pub use web_ui::create_ui_router;

§Modules

adaptive_lookahead
Adaptive lookahead controller for speculative decoding.
adaptive_sampling
Adaptive sampling: dynamically adjust temperature/top_p based on generation state.
admin
Admin API endpoints for operational management.
api_extensions
Extended /v1/chat/completions handler.
api_types
Extended OpenAI-compatible API types.
async_engine
Async wrapper around the synchronous InferenceEngine.
auto_tuner
Performance auto-tuning: detect hardware, select kernels, tune parameters.
batch_engine
Batched inference: process multiple prompts efficiently.
beam_search
Beam search decoding for OxiBonsai.
builders
Builder patterns for ergonomic OxiBonsai setup.
circuit_breaker
Circuit breaker pattern for production resilience.
completions
OpenAI v1 completions endpoint (legacy, non-chat).
config
Layered configuration system for OxiBonsai.
constrained_decoding
Grammar-constrained decoding for token-by-token generation.
context_manager
Context window management for multi-turn inference.
continuous_batch
Continuous (iteration-level) batching for OxiBonsai.
convenience
High-level convenience functions for common OxiBonsai operations.
dedup
Request deduplication: cache identical requests to avoid redundant inference.
distributed
Distributed serving infrastructure for OxiBonsai.
embedding_index
Navigable Small World (NSW) approximate nearest-neighbor index.
embeddings
OpenAI v1 embeddings endpoint.
engine
Inference engine orchestrating model loading and generation.
error
Runtime error types.
grammar
Context-free grammar engine with Earley-parser-based constrained decoding.
health
Health check system for production monitoring.
hot_reload
Model hot-reload: update model weights without server restart.
json_schema
JSON Schema-driven structured output constraint.
kv_cache_policy
KV cache compression policy controller.
memory
Runtime memory profiling.
metrics
Lightweight, thread-safe Prometheus-compatible metrics system.
middleware
Request middleware: context injection, logging, CORS, and idempotency caching.
model_cache
In-process model cache for GGUF files.
multi_model
Multi-model serving: manage base models + LoRA adapters with smart routing.
native_tokenizer
Native tokenizer bridge using oxibonsai-tokenizer (pure Rust BPE).
nbest
N-best hypothesis tracking for diverse generation.
ngram_cache
N-gram cache for zero-cost draft generation in speculative decoding.
pipeline
High-level inference pipeline API for OxiBonsai.
prefix_cache_engine
Prefix-cache-aware inference engine wrapper.
presets
Predefined sampling parameter presets for common use cases.
profiler
Inference profiler: per-layer timing, memory, and FLOP accounting.
quality_metrics
Generation quality metrics for evaluating LLM outputs.
rate_limiter
Token bucket rate limiter with per-client and optional global limits.
recovery
Error recovery strategies for production resilience.
request_id
Structured request identifiers for end-to-end tracing.
request_metrics
Per-request token-rate and latency metrics.
request_queue
Bounded request queue with backpressure for the inference pipeline.
sampling
Sampling strategies for text generation.
sampling_advanced
Advanced sampling algorithms for text generation.
semantic_cache
Semantic caching layer for LLM inference.
server
OpenAI-compatible chat completions server.
speculative
Speculative decoding for accelerated autoregressive generation.
stream_metrics
Token streaming metrics: TTFT, inter-token latency (TBT), throughput.
streaming
Enhanced SSE streaming with delta tokens, finish reasons, and usage info.
token_budget
Token budget management: enforce per-request and global token limits.
token_healing
Token healing for partial-token prompt repair.
tokenizer_bridge
Tokenizer bridge wrapping HuggingFace tokenizers.
tool_calling
High-level tool-calling orchestration for OxiBonsai.
tracing_setup
Tracing initialization with configurable output format.
wasm_api
WASM-compatible inference API.
web_ui
Serves a minimal HTML chat interface from the Axum server.