llm_base/lib.rs
//! This crate provides a unified interface for loading and using
//! Large Language Models (LLMs).
//!
//! This is the base crate that implementors can use to implement their own
//! LLMs.
//!
//! As a user, you probably want to use the [llm](https://crates.io/crates/llm) crate instead.
#![deny(missing_docs)]

use thiserror::Error;

mod inference_session;
mod loader;
mod quantize;
mod vocabulary;

pub mod model;
pub mod util;

pub use ggml;
pub use ggml::Type as ElementType;

pub use inference_session::{
    InferenceRequest, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceStats,
    ModelKVMemoryType, SnapshotError,
};
pub use loader::{
    load, load_progress_callback_stdout, ContainerType, FileType, LoadError, LoadProgress, Loader,
    TensorLoader,
};
pub use memmap2::Mmap;
pub use model::{Hyperparameters, KnownModel, Model, ModelParameters, OutputRequest};
pub use quantize::{quantize, QuantizeError, QuantizeProgress};
pub use util::TokenUtf8Buffer;
pub use vocabulary::{TokenBias, TokenId, Vocabulary};

#[derive(Clone, Debug, PartialEq)]
/// The parameters for text generation.
///
/// This needs to be provided during all inference calls,
/// but can be changed between calls.
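///
/// # Example
///
/// A minimal sketch of overriding individual defaults via struct update
/// syntax (this assumes the crate is available under the name `llm_base`):
///
/// ```
/// use llm_base::InferenceParameters;
///
/// let params = InferenceParameters {
///     temperature: 0.65,
///     top_k: 50,
///     ..Default::default()
/// };
/// assert_eq!(params.n_threads, 8);
/// ```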
pub struct InferenceParameters {
    /// The number of threads to use.
    pub n_threads: usize,
    /// Controls batch/chunk size for prompt ingestion in
    /// [InferenceSession::feed_prompt].
    pub n_batch: usize,
    /// The top K tokens by score are kept during sampling.
    pub top_k: usize,
    /// The cumulative probability after which no more tokens are kept for sampling.
    pub top_p: f32,
    /// The penalty for repeating tokens. Higher values make the generation less
    /// likely to get into a loop, but may harm results when repetitive outputs
    /// are desired.
    pub repeat_penalty: f32,
    /// Temperature (randomness) used for sampling. A higher number is more random.
    pub temperature: f32,
    /// A list of tokens to bias against during generation.
    pub bias_tokens: TokenBias,
    /// The number of tokens to consider for the repetition penalty.
    pub repetition_penalty_last_n: usize,
}
impl Default for InferenceParameters {
    fn default() -> Self {
        Self {
            n_threads: 8,
            n_batch: 8,
            top_k: 40,
            top_p: 0.95,
            repeat_penalty: 1.30,
            temperature: 0.80,
            bias_tokens: TokenBias::default(),
            repetition_penalty_last_n: 512,
        }
    }
}

#[derive(Error, Debug)]
/// Errors encountered during the inference process.
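///
/// # Example
///
/// A minimal sketch of handling the error variants; `run_inference` is a
/// hypothetical function returning `Result<(), InferenceError>`:
///
/// ```
/// use llm_base::InferenceError;
///
/// # fn run_inference() -> Result<(), InferenceError> { Ok(()) }
/// match run_inference() {
///     Ok(()) => {}
///     // The model emitted an end-of-text token; usually a normal stop.
///     Err(InferenceError::EndOfText) => {}
///     Err(err) => eprintln!("inference failed: {}", err),
/// }
/// ```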
pub enum InferenceError {
    #[error("an invalid token was encountered during tokenization")]
    /// During tokenization, one of the produced tokens was invalid / zero.
    TokenizationFailed,
    #[error("the context window is full")]
    /// The context window for the model is full.
    ContextFull,
    #[error("reached end of text")]
    /// The model has produced an end-of-text token, signalling that it thinks the text should end here.
    ///
    /// Note that this error *can* be ignored and inference can continue, but the results are not guaranteed to be sensible.
    EndOfText,
    #[error("the user-specified callback returned an error")]
    /// The user-specified callback returned an error.
    UserCallback(Box<dyn std::error::Error>),
}