llm_base/
lib.rs

//! This crate provides a unified interface for loading and using
//! Large Language Models (LLMs).
//!
//! This is the base crate that implementors can use to implement their own
//! LLMs.
//!
//! As a user, you probably want to use the [llm](https://crates.io/crates/llm) crate instead.
#![deny(missing_docs)]

use thiserror::Error;

mod inference_session;
mod loader;
mod quantize;
mod vocabulary;

pub mod model;
pub mod util;

pub use ggml;
pub use ggml::Type as ElementType;

pub use inference_session::{
    InferenceRequest, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceStats,
    ModelKVMemoryType, SnapshotError,
};
pub use loader::{
    load, load_progress_callback_stdout, ContainerType, FileType, LoadError, LoadProgress, Loader,
    TensorLoader,
};
pub use memmap2::Mmap;
pub use model::{Hyperparameters, KnownModel, Model, ModelParameters, OutputRequest};
pub use quantize::{quantize, QuantizeError, QuantizeProgress};
pub use util::TokenUtf8Buffer;
pub use vocabulary::{TokenBias, TokenId, Vocabulary};

#[derive(Clone, Debug, PartialEq)]
/// The parameters for text generation.
///
/// This needs to be provided during all inference calls,
/// but can be changed between calls.
pub struct InferenceParameters {
    /// The number of threads to use.
    pub n_threads: usize,
    /// Controls batch/chunk size for prompt ingestion in
    /// [InferenceSession::feed_prompt].
    pub n_batch: usize,
    /// The top K tokens by score are kept during sampling.
    pub top_k: usize,
    /// The cumulative probability after which no more tokens are kept for sampling.
    pub top_p: f32,
    /// The penalty for repeating tokens. Higher values make the generation less
    /// likely to get into a loop, but may harm results when repetitive outputs
    /// are desired.
    pub repeat_penalty: f32,
    /// Temperature (randomness) used for sampling. A higher number is more random.
    pub temperature: f32,
    /// A list of tokens to bias against in the process of generation.
    pub bias_tokens: TokenBias,
    /// The number of tokens to consider for the repetition penalty.
    pub repetition_penalty_last_n: usize,
}
impl Default for InferenceParameters {
    fn default() -> Self {
        Self {
            n_threads: 8,
            n_batch: 8,
            top_k: 40,
            top_p: 0.95,
            repeat_penalty: 1.30,
            temperature: 0.80,
            bias_tokens: TokenBias::default(),
            repetition_penalty_last_n: 512,
        }
    }
}

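// A minimal sketch of building `InferenceParameters`: override a few fields
// and take the rest from `Default` via struct update syntax. The values
// chosen here are illustrative only.
#[allow(dead_code)]
fn example_inference_parameters() -> InferenceParameters {
    InferenceParameters {
        n_threads: 4,        // fewer threads than the default of 8
        temperature: 0.7,    // slightly less random sampling than the default 0.80
        ..Default::default() // keep the remaining defaults (top_k, top_p, ...)
    }
}
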
#[derive(Error, Debug)]
/// Errors encountered during the inference process.
pub enum InferenceError {
    #[error("an invalid token was encountered during tokenization")]
    /// During tokenization, one of the produced tokens was invalid / zero.
    TokenizationFailed,
    #[error("the context window is full")]
    /// The context window for the model is full.
    ContextFull,
    #[error("reached end of text")]
    /// The model has produced an end of text token, signalling that it thinks that the text should end here.
    ///
    /// Note that this error *can* be ignored and inference can continue, but the results are not guaranteed to be sensible.
    EndOfText,
    #[error("the user-specified callback returned an error")]
    /// The user-specified callback returned an error.
    UserCallback(Box<dyn std::error::Error>),
}
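
// A minimal sketch of handling an inference result (the `Result` shape here is
// assumed for illustration): `EndOfText` is treated as a normal stop, since its
// documentation notes it *can* be ignored, while all other errors propagate.
#[allow(dead_code)]
fn example_handle_inference_result(
    result: Result<(), InferenceError>,
) -> Result<(), InferenceError> {
    match result {
        // The model signalled end of text; treat it as a clean stop.
        Ok(()) | Err(InferenceError::EndOfText) => Ok(()),
        // Propagate tokenization failures, a full context window,
        // and user-callback errors.
        Err(err) => Err(err),
    }
}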