//! # OxiBonsai Runtime
//!
//! High-level inference engine, sampling, tokenizer bridge, and
//! OpenAI-compatible HTTP server for OxiBonsai.
//!
//! This crate ties together [`oxibonsai_core`], [`oxibonsai_kernels`],
//! and [`oxibonsai_model`] into a production-ready inference runtime:
//!
//! - **[`InferenceEngine`]** — orchestrates prefill + decode with metrics
//! - **[`Sampler`]** — temperature, top-k, top-p, repetition penalty
//! - **[`SamplingPreset`]** — named parameter sets (Greedy, Balanced, Creative, ...)
//! - **[`SamplerBuilder`] / [`ConfigBuilder`] / [`EngineBuilder`]** — ergonomic setup
//! - **[`server`]** — Axum-based `/v1/chat/completions` server (feature-gated)
//! - **[`InferenceMetrics`]** — Prometheus-compatible counters, gauges, histograms
//! - **[`CircuitBreaker`]** — resilience pattern for cascading-failure protection
//! - **[`HealthReport`]** — structured health checks for ops monitoring
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use oxibonsai_core::config::Qwen3Config;
//! use oxibonsai_runtime::engine::InferenceEngine;
//! use oxibonsai_runtime::presets::SamplingPreset;
//!
//! let config = Qwen3Config::tiny_test();
//! let params = SamplingPreset::Balanced.params();
//! let mut engine = InferenceEngine::new(config, params, 42);
//!
//! let tokens = engine.generate(&[151644, 872], 10)
//!     .expect("generation should succeed");
//! ```

33pub mod adaptive_lookahead;
34pub mod adaptive_sampling;
35#[cfg(feature = "server")]
36pub mod admin;
37#[cfg(feature = "server")]
38pub mod api_extensions;
39#[cfg(feature = "server")]
40pub mod api_types;
41#[cfg(not(target_arch = "wasm32"))]
42pub mod async_engine;
43pub mod auto_tuner;
44pub mod batch_engine;
45pub mod beam_search;
46pub mod builders;
47pub mod circuit_breaker;
48#[cfg(feature = "server")]
49pub mod completions;
50pub mod config;
51pub mod constrained_decoding;
52pub mod context_manager;
53pub mod continuous_batch;
54pub mod convenience;
55pub mod dedup;
56#[cfg(feature = "server")]
57pub mod distributed;
58pub mod embedding_index;
59#[cfg(feature = "server")]
60pub mod embeddings;
61pub mod engine;
62pub mod error;
63pub mod grammar;
64pub mod health;
65pub mod hot_reload;
66pub mod json_schema;
67pub mod kv_cache_policy;
68pub mod memory;
69pub mod metrics;
70pub mod middleware;
71pub mod model_cache;
72pub mod multi_model;
73pub mod native_tokenizer;
74pub mod nbest;
75pub mod ngram_cache;
76pub mod pipeline;
77pub mod prefix_cache_engine;
78pub mod presets;
79pub mod profiler;
80pub mod quality_metrics;
81#[cfg(feature = "rag")]
82pub mod rag_server;
83pub mod rate_limiter;
84pub mod recovery;
85pub mod request_id;
86pub mod request_metrics;
87pub mod request_queue;
88pub mod sampling;
89pub mod sampling_advanced;
90pub mod semantic_cache;
91#[cfg(feature = "server")]
92pub mod server;
93pub mod speculative;
94pub mod stream_metrics;
95pub mod streaming;
96pub mod token_budget;
97pub mod token_healing;
98pub mod tokenizer_bridge;
99#[cfg(feature = "server")]
100pub mod tool_calling;
101pub mod tracing_setup;
102pub mod wasm_api;
103#[cfg(feature = "server")]
104pub mod web_ui;
105
106pub use adaptive_lookahead::{AdaptiveLookahead, AdaptiveLookaheadConfig, AdaptiveLookaheadError};
107pub use adaptive_sampling::{
108    AdaptiveSamplerChain, AdaptiveStrategy, EntropyCooling, GenerationState, RepetitionAdaptation,
109    ScheduledDecay,
110};
111pub use auto_tuner::{
112    AutoTuner, CpuArch, CpuFeatures, KernelBenchmark, KvCacheType, MemoryBudget, SimdTier,
113    TuningRecommendation,
114};
115pub use builders::{ConfigBuilder, EngineBuilder, SamplerBuilder};
116pub use circuit_breaker::{CircuitBreaker, CircuitBreakerConfig, CircuitState};
117pub use config::OxiBonsaiConfig;
118pub use constrained_decoding::{
119    AllowListConstraint, ConstrainedSampler, ConstrainedSamplerBuilder, ConstraintError,
120    JsonConstraint, JsonParseState, LengthConstraint, NoConstraint, RegexConstraint,
121    SequenceConstraint, TokenConstraint,
122};
123pub use convenience::{GenerationResult, MemoryEstimate, ModelFileInfo, TokenStats};
124pub use dedup::{DedupCache, DedupStats, RequestKey};
125#[cfg(feature = "server")]
126pub use distributed::{
127    ConsistentHashRing, CoordinatorConfig, DistributedCoordinator, NodeInfo, NodeRegistry,
128};
129pub use engine::InferenceEngine;
130pub use error::{RuntimeError, RuntimeResult};
131pub use grammar::{
132    compile_json_schema, compile_json_schema_str, compile_regex, parse_bnf, parse_gbnf,
133    BnfParseError, EarleyRecognizer, GbnfParseError, Grammar, GrammarConstraint,
134    JsonSchemaCompileError, RegexCompileError, Rule, Symbol,
135};
136pub use health::{HealthReport, HealthStatus};
137pub use hot_reload::{HotReloadCoordinator, ModelVersion, ReloadLog};
138pub use json_schema::{
139    parse_schema, schema_example, schema_template, validate_against_schema, SchemaError,
140    SchemaState, SchemaType,
141};
142pub use kv_cache_policy::{KvCacheLevel, KvCachePolicy, KvCachePolicyConfig, KvCachePolicyError};
143pub use memory::{get_rss_bytes, MemoryProfiler, MemorySnapshot};
144pub use metrics::InferenceMetrics;
145pub use multi_model::{
146    AdapterRef, AdapterStack, EndpointStatus, ModelEndpoint, ModelId, ModelListEntry,
147    ModelRegistry, ModelRouter, RoutingError,
148};
149pub use native_tokenizer::{NativeTokenizerBridge, NativeTokenizerError};
150pub use nbest::{Hypothesis, NBestDecoder, NBestList};
151pub use presets::SamplingPreset;
152pub use profiler::{flop_counter, AggregateStats, ProfileEvent, ProfileTrace, Profiler};
153pub use quality_metrics::{
154    extract_ngrams, perplexity_from_logprobs, repetition_penalty_rate, self_bleu, token_entropy,
155    BatchQualityAnalyzer, BleuScore, DiversityMetrics, GenerationQualityReport, RepetitionMetrics,
156};
157pub use recovery::{ErrorClass, RecoveryStrategy};
158pub use request_id::RequestId;
159pub use request_metrics::{
160    AggregateRateSnapshot, RequestRateAggregator, RequestRateSnapshot, RequestRateTracker,
161};
162pub use sampling::Sampler;
163pub use sampling_advanced::{
164    EtaSampler, LcgRng, MinPSampler, MirostatV1Sampler, MirostatV2Sampler, SamplerChain,
165    SamplerStep, TypicalSampler,
166};
167pub use stream_metrics::{RequestStreamMetrics, StreamMetricsSnapshot, StreamingMetricsAggregator};
168pub use token_budget::{
169    BudgetConfig, BudgetError, BudgetPolicy, GlobalTokenBudget, RequestBudget, TokenCostEstimate,
170};
171pub use tokenizer_bridge::TokenizerBridge;
172#[cfg(feature = "server")]
173pub use tool_calling::{
174    build_tool_constraint, make_tool_call, new_tool_call_id, select_tool, validate_tool_arguments,
175    ToolCallError, ToolRegistry,
176};
177pub use tracing_setup::{init_tracing, TracingConfig};
178#[cfg(feature = "server")]
179pub use web_ui::create_ui_router;