orion_core/backend.rs
1use std::sync::atomic::AtomicBool;
2use std::sync::Arc;
3
4use crate::error::CoreResult;
5
/// Per-token streaming callback.
///
/// Called once for each generated token with, in order: the token text,
/// the number of tokens generated so far, and the current tokens/sec.
pub type TokenCallback = Box<dyn FnMut(&str, u32, f64) + Send>;
9
/// Inference parameters for a single generation request.
///
/// The [`Default`] implementation yields 2048 max tokens, a temperature of
/// 0.7, a 4096-token context window, and a thread count derived from the
/// machine's available parallelism.
#[derive(Debug, Clone)]
pub struct InferenceParams {
    /// Maximum number of tokens to generate in the response.
    pub max_tokens: u32,
    /// Sampling temperature (0.0 = deterministic, higher = more random).
    pub temperature: f32,
    /// Context window size in tokens to allocate for this request.
    pub context_size: u32,
    /// Number of CPU threads to use for inference.
    pub n_threads: u32,
}

impl Default for InferenceParams {
    /// Builds the baseline parameters.
    ///
    /// `n_threads` is the available parallelism reported by the standard
    /// library minus two, but never less than one. If the parallelism cannot
    /// be queried, it falls back to 4.
    fn default() -> Self {
        // Thread count used when `available_parallelism` returns an error.
        const FALLBACK_THREADS: u32 = 4;
        // Subtracted from the reported parallelism before clamping to >= 1.
        const RESERVED_THREADS: u32 = 2;

        let n_threads = std::thread::available_parallelism()
            .map(|cores| {
                // Saturate instead of using a bare `as u32` cast, which would
                // silently wrap if the count ever exceeded `u32::MAX`.
                u32::try_from(cores.get())
                    .unwrap_or(u32::MAX)
                    .saturating_sub(RESERVED_THREADS)
                    .max(1)
            })
            .unwrap_or(FALLBACK_THREADS);

        Self {
            max_tokens: 2048,
            temperature: 0.7,
            context_size: 4096,
            n_threads,
        }
    }
}
36
/// Outcome of one finished generation: the produced text plus its statistics.
#[derive(Debug, Clone)]
pub struct GenerationResult {
    /// Complete text that was generated.
    pub text: String,
    /// How many tokens the response contains.
    pub tokens_generated: u32,
    /// How many tokens the (formatted) input prompt contained.
    pub prompt_tokens: u32,
    /// Average generation speed, in tokens per second.
    pub tokens_per_sec: f64,
    /// Milliseconds from the start of the request until the first token was emitted.
    pub time_to_first_token_ms: f64,
    /// Total generation time, in milliseconds.
    pub generation_time_ms: f64,
}
53
54/// Trait for LLM backends (llama.cpp, MLX, cloud APIs, etc.).
55///
56/// The agent loop is backend-agnostic. OrionPod implements this
57/// with llama.cpp; other backends can be swapped in freely.
58///
59/// `generate` runs synchronously on a blocking thread. The agent
60/// loop handles the async orchestration around it.
61///
62/// ```no_run
63/// use orion_core::{LlmBackend, InferenceParams, GenerationResult, TokenCallback, CoreResult};
64/// use std::sync::atomic::AtomicBool;
65/// use std::sync::Arc;
66///
67/// struct MyBackend; // your engine state
68///
69/// impl LlmBackend for MyBackend {
70/// fn generate(
71/// &self,
72/// prompt: &str, // fully formatted (chat template applied)
73/// params: &InferenceParams, // max_tokens, temperature, context_size, n_threads
74/// abort: Arc<AtomicBool>, // check each token to support cancellation
75/// on_token: TokenCallback, // call with (token_text, count, tokens_per_sec)
76/// ) -> CoreResult<GenerationResult> {
77/// // Feed prompt, sample tokens, call on_token per token, return stats.
78/// todo!()
79/// }
80///
81/// fn tokenize_count(&self, text: &str) -> CoreResult<u32> {
82/// // Count tokens without running inference (used for budgeting).
83/// todo!()
84/// }
85///
86/// fn is_ready(&self) -> bool {
87/// // Whether a model is loaded and ready.
88/// todo!()
89/// }
90/// }
91/// ```
92pub trait LlmBackend: Send + Sync {
93 /// Run inference on a formatted prompt string.
94 ///
95 /// The prompt is already fully formatted (chat template applied).
96 /// The backend just needs to feed it and generate tokens.
97 fn generate(
98 &self,
99 prompt: &str,
100 params: &InferenceParams,
101 abort: Arc<AtomicBool>,
102 on_token: TokenCallback,
103 ) -> CoreResult<GenerationResult>;
104
105 /// Count tokens in a string without running inference.
106 fn tokenize_count(&self, text: &str) -> CoreResult<u32>;
107
108 /// Whether a model is currently loaded and ready.
109 fn is_ready(&self) -> bool;
110}