Skip to main content

orion_core/
backend.rs

1use std::sync::atomic::AtomicBool;
2use std::sync::Arc;
3
4use crate::error::CoreResult;
5
/// Streaming hook that a backend calls once per generated token.
///
/// The arguments are, in order: the token text, the number of tokens
/// generated so far, and the current speed in tokens per second.
pub type TokenCallback = Box<dyn FnMut(&str, u32, f64) + Send>;
9
/// Tunable settings that apply to one generation request.
#[derive(Clone, Debug)]
pub struct InferenceParams {
    /// Upper bound on how many tokens the response may contain.
    pub max_tokens: u32,
    /// Sampling temperature; 0.0 is deterministic and larger values are more random.
    pub temperature: f32,
    /// Size, in tokens, of the context window allocated for the request.
    pub context_size: u32,
    /// How many CPU threads inference should use.
    pub n_threads: u32,
}

impl Default for InferenceParams {
    /// Builds the stock parameter set: 2048 max tokens, temperature 0.7,
    /// a 4096-token context, and a thread count derived from the machine.
    fn default() -> Self {
        // Two fewer threads than the reported parallelism, clamped to at
        // least one; if the parallelism cannot be queried, use 4.
        let n_threads = match std::thread::available_parallelism() {
            Ok(cores) => (cores.get() as u32).saturating_sub(2).max(1),
            Err(_) => 4,
        };

        InferenceParams {
            max_tokens: 2048,
            temperature: 0.7,
            context_size: 4096,
            n_threads,
        }
    }
}
36
/// Outcome and statistics of one finished generation.
#[derive(Clone, Debug)]
pub struct GenerationResult {
    /// Complete text produced by the model.
    pub text: String,
    /// How many tokens the response contains.
    pub tokens_generated: u32,
    /// How many tokens the formatted prompt that was fed in contained.
    pub prompt_tokens: u32,
    /// Mean generation speed, in tokens per second.
    pub tokens_per_sec: f64,
    /// Milliseconds elapsed between the start of the request and the first emitted token.
    pub time_to_first_token_ms: f64,
    /// Milliseconds spent on the generation in total.
    pub generation_time_ms: f64,
}
53
54/// Trait for LLM backends (llama.cpp, MLX, cloud APIs, etc.).
55///
56/// The agent loop is backend-agnostic. OrionPod implements this
57/// with llama.cpp; other backends can be swapped in freely.
58///
59/// `generate` runs synchronously on a blocking thread. The agent
60/// loop handles the async orchestration around it.
61///
62/// ```no_run
63/// use orion_core::{LlmBackend, InferenceParams, GenerationResult, TokenCallback, CoreResult};
64/// use std::sync::atomic::AtomicBool;
65/// use std::sync::Arc;
66///
67/// struct MyBackend; // your engine state
68///
69/// impl LlmBackend for MyBackend {
70///     fn generate(
71///         &self,
72///         prompt: &str,             // fully formatted (chat template applied)
73///         params: &InferenceParams, // max_tokens, temperature, context_size, n_threads
74///         abort: Arc<AtomicBool>,   // check each token to support cancellation
75///         on_token: TokenCallback,  // call with (token_text, count, tokens_per_sec)
76///     ) -> CoreResult<GenerationResult> {
77///         // Feed prompt, sample tokens, call on_token per token, return stats.
78///         todo!()
79///     }
80///
81///     fn tokenize_count(&self, text: &str) -> CoreResult<u32> {
82///         // Count tokens without running inference (used for budgeting).
83///         todo!()
84///     }
85///
86///     fn is_ready(&self) -> bool {
87///         // Whether a model is loaded and ready.
88///         todo!()
89///     }
90/// }
91/// ```
92pub trait LlmBackend: Send + Sync {
93    /// Run inference on a formatted prompt string.
94    ///
95    /// The prompt is already fully formatted (chat template applied).
96    /// The backend just needs to feed it and generate tokens.
97    fn generate(
98        &self,
99        prompt: &str,
100        params: &InferenceParams,
101        abort: Arc<AtomicBool>,
102        on_token: TokenCallback,
103    ) -> CoreResult<GenerationResult>;
104
105    /// Count tokens in a string without running inference.
106    fn tokenize_count(&self, text: &str) -> CoreResult<u32>;
107
108    /// Whether a model is currently loaded and ready.
109    fn is_ready(&self) -> bool;
110}