/// Trait for any inference backend (mistral.rs, llama.cpp, etc.).
///
/// Provides a unified interface for loading and running local LLM
/// inference. Implementations can use different backends while
/// presenting the same API to consumers.
///
/// NOTE: every async method returns a return-position `impl Future`
/// (RPITIT style), so this trait is not dyn compatible — it cannot be
/// used as `dyn InferenceBackend`.
pub trait InferenceBackend: Send + Sync {
// Required methods

/// Load the model at `model_path` using `config`.
fn load(
&mut self,
model_path: PathBuf,
config: LoadConfig,
) -> impl Future<Output = Result<(), NativeError>> + Send;
/// Unload the model from memory, freeing the GPU/CPU memory it used.
fn unload(&mut self) -> impl Future<Output = Result<(), NativeError>> + Send;
/// Whether a model is currently loaded.
fn is_loaded(&self) -> bool;
/// Metadata about the loaded model; `None` if no model is loaded.
fn model_info(&self) -> Option<&ModelInfo>;
/// Generate a complete response for `prompt` (non-streaming
/// counterpart of [`infer_stream`]).
///
/// `options` carries generation options (temperature, max_tokens, etc.).
fn infer(
&self,
prompt: &str,
options: ChatOptions,
) -> impl Future<Output = Result<ChatResponse, NativeError>> + Send;
/// Generate a response as a stream of token strings, yielded as they
/// are generated.
///
/// `options` carries generation options (temperature, max_tokens, etc.).
fn infer_stream(
&self,
prompt: &str,
options: ChatOptions,
) -> impl Future<Output = Result<impl Stream<Item = Result<String, NativeError>> + Send, NativeError>> + Send;
}
Expand description
Trait for any inference backend (mistral.rs, llama.cpp, etc.).
This trait provides a unified interface for loading and running local LLM inference. Implementations can use different backends while presenting the same API to consumers.
Required Methods§
Source
fn load(
&mut self,
model_path: PathBuf,
config: LoadConfig,
) -> impl Future<Output = Result<(), NativeError>> + Send
fn load( &mut self, model_path: PathBuf, config: LoadConfig, ) -> impl Future<Output = Result<(), NativeError>> + Send
Load the model at the given path using the given configuration.
Source
fn unload(&mut self) -> impl Future<Output = Result<(), NativeError>> + Send
fn unload(&mut self) -> impl Future<Output = Result<(), NativeError>> + Send
Unload the model from memory.
Frees GPU/CPU memory used by the model.
Source
fn model_info(&self) -> Option<&ModelInfo>
fn model_info(&self) -> Option<&ModelInfo>
Get metadata about the loaded model.
Returns None if no model is loaded.
Source
fn infer(
&self,
prompt: &str,
options: ChatOptions,
) -> impl Future<Output = Result<ChatResponse, NativeError>> + Send
fn infer( &self, prompt: &str, options: ChatOptions, ) -> impl Future<Output = Result<ChatResponse, NativeError>> + Send
Generate a complete response for the given prompt (non-streaming).
Source
fn infer_stream(
&self,
prompt: &str,
options: ChatOptions,
) -> impl Future<Output = Result<impl Stream<Item = Result<String, NativeError>> + Send, NativeError>> + Send
fn infer_stream( &self, prompt: &str, options: ChatOptions, ) -> impl Future<Output = Result<impl Stream<Item = Result<String, NativeError>> + Send, NativeError>> + Send
Generate a response (streaming).
Returns a stream of token strings as they are generated.
§Arguments
prompt - The input prompt
options - Generation options (temperature, max_tokens, etc.)
Dyn Compatibility§
This trait is not dyn compatible.
In older versions of Rust, dyn compatibility was called "object safety", so this trait is not object safe.
Implementors§
impl InferenceBackend for NativeRuntime
Available on crate feature inference only.