llm_chain/tokens.rs

//! # Tokens Module
//!
//! This module provides utilities for managing tokens in Large Language Models (LLMs),
//! primarily focusing on measuring the sizes of prompts. This is useful for ensuring that
//! prompts stay within the context window size supported by a given model.
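//!
//! For example, a [`TokenCount`] can be used to check how much of a model's context
//! window remains once the prompt's tokens are accounted for (the numbers here are illustrative):
//!
//! ```
//! use llm_chain::tokens::TokenCount;
//! // Suppose the model allows 4096 tokens and the formatted prompt uses 1200 of them.
//! let count = TokenCount::new(4096, 1200);
//! assert_eq!(count.tokens_remaining(), 2896);
//! assert!(count.has_room_for(500));
//! ```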

use crate::step::Step;
use crate::{traits, Parameters};
use serde::{Deserialize, Serialize};
use std::cmp::max;
use thiserror::Error;

/// Custom error type for handling prompt token-related errors.
#[derive(Clone, Debug, Error)]
pub enum PromptTokensError {
    /// Indicates that prompt tokens are not accessible for the given step.
    #[error("The prompt tokens are not accessible for this type of step.")]
    NotAvailable,
    /// Indicates that the prompt tokens could not be computed.
    #[error("The prompt tokens could not be computed.")]
    UnableToCompute,
    /// Indicates that the prompt tokens could not be computed because formatting the prompt failed.
    #[error("Formatting prompt failed: {0}")]
    PromptFormatFailed(#[from] crate::prompt::StringTemplateError),
    /// Indicates that the underlying tokenizer returned an error.
    #[error("Tokenizer error: {0}")]
    TokenizerError(#[from] crate::tokens::TokenizerError),
}

/// An extension trait for the `Executor` trait that provides additional methods for working
/// with token counts.
pub trait ExecutorTokenCountExt: traits::Executor {
    /// Splits a `Parameters` object into multiple smaller `Parameters` objects that each fit within
    /// the context window size supported by the given model.
    ///
    /// # Arguments
    /// * `step` - The step that will process the parameters; it determines which tokenizer and text splitter are used.
    /// * `doc` - The `Parameters` object to split into multiple, smaller parameter objects.
    /// * `base_parameters` - Additional parameters used when formatting the prompt to estimate how many tokens the prompt itself consumes.
    /// * `chunk_overlap` - The number of tokens each chunk should overlap with the previous and next chunks.
    ///
    /// # Errors
    ///
    /// Returns a `PromptTokensError` if there is an issue computing the tokens.
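    ///
    /// # Examples
    ///
    /// An illustrative sketch (not compiled as a doc-test): `exec` stands in for some concrete
    /// `Executor` implementation, and `step`, `doc_params`, and `base_params` are placeholders:
    ///
    /// ```ignore
    /// let chunks = exec.split_to_fit(&step, &doc_params, &base_params, Some(32))?;
    /// // Each element holds a slice of the original text small enough to fit the
    /// // model's context window alongside the prompt.
    /// for chunk in &chunks {
    ///     println!("{:?}", chunk.get_text());
    /// }
    /// ```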
    fn split_to_fit(
        &self,
        step: &Step,
        doc: &Parameters,
        base_parameters: &Parameters,
        chunk_overlap: Option<usize>,
    ) -> Result<Vec<Parameters>, PromptTokensError> {
        let splitter = self
            .get_tokenizer(step.options())
            .map_err(|_e| PromptTokensError::UnableToCompute)?;

        let text = doc.get_text().ok_or(PromptTokensError::UnableToCompute)?;

        let prompt = step.format(&base_parameters.combine(&Parameters::new_with_text("")))?;
        let tokens_used = self.tokens_used(step.options(), &prompt)?;
        let chunk_overlap = chunk_overlap.unwrap_or(0);

        let split_params = splitter
            .split_text(
                &text,
                tokens_used.max_tokens as usize - tokens_used.tokens_used as usize,
                chunk_overlap,
            )
            .map_err(|_e| PromptTokensError::UnableToCompute)?
            .into_iter()
            .map(Parameters::new_with_text)
            .collect();
        Ok(split_params)
    }
}

/// Blanket implementation of ExecutorTokenCountExt for all Executors
impl<E: traits::Executor> ExecutorTokenCountExt for E {}

/// Struct representing token count information, including the maximum tokens allowed and the
/// total number of tokens used.
pub struct TokenCount {
    /// The maximum number of tokens allowed.
    max_tokens: i32,
    /// The total number of tokens used.
    tokens_used: i32,
}
impl TokenCount {
    /// Creates a new `TokenCount` instance with the given maximum tokens and tokens used.
    ///
    /// # Arguments
    ///
    /// * `max_tokens` - The maximum number of tokens allowed.
    /// * `tokens_used` - The total number of tokens used.
    pub fn new(max_tokens: i32, tokens_used: i32) -> Self {
        Self {
            max_tokens,
            tokens_used,
        }
    }

    /// Returns the number of tokens that could be added to the context window.
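    ///
    /// # Examples
    ///
    /// ```
    /// use llm_chain::tokens::TokenCount;
    /// let token_count = TokenCount::new(100, 50);
    /// assert_eq!(token_count.tokens_remaining(), 50);
    /// ```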
    pub fn tokens_remaining(&self) -> i32 {
        self.max_tokens - self.tokens_used
    }

    /// Returns true if there is still room in the context window.
    pub fn has_tokens_remaining(&self) -> bool {
        self.has_room_for(1)
    }

    /// Returns true if there is room for the given number of tokens.
    ///
    /// # Arguments
    ///
    /// * `tokens` - The number of tokens to check if there is room for.
    ///
    /// # Examples
    ///
    /// ```
    /// use llm_chain::tokens::TokenCount;
    /// let token_count = TokenCount::new(100, 50);
    /// assert!(token_count.has_room_for(49));
    /// ```
    pub fn has_room_for(&self, tokens: i32) -> bool {
        self.tokens_remaining() >= tokens
    }
}

/// Errors that can occur while tokenizing text or converting tokens back into text.
#[derive(Error, Debug, Clone)]
pub enum TokenizerError {
    #[error("Error tokenizing input text")]
    TokenizationError,
    #[error("Error stringifying tokens to text")]
    ToStringError,
    #[error("Error creating tokenizer")]
    TokenizerCreationError,
    #[error("Token Collection type mismatch")]
    TokenCollectionTypeMismatch,
}

/// A tokenizer that converts text into tokens and back, and can split text into chunks
/// that fit within a given token budget.
pub trait Tokenizer {
    /// Tokenizes a string.
    ///
    /// # Parameters
    ///
    /// * `doc`: The string to tokenize.
    ///
    /// # Returns
    ///
    /// A `Result` containing a `TokenCollection`, or an error if there was a problem.
    fn tokenize_str(&self, doc: &str) -> Result<TokenCollection, TokenizerError>;

    /// Converts a collection of tokens back into a string.
    ///
    /// # Parameters
    ///
    /// * `tokens`: The tokens to convert.
    ///
    /// # Returns
    ///
    /// A `Result` containing a string, or an error if there was a problem.
    fn to_string(&self, tokens: TokenCollection) -> Result<String, TokenizerError>;

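    /// Splits `doc` into chunks of at most `max_tokens_per_chunk` tokens, with consecutive
    /// chunks overlapping by `chunk_overlap` tokens, and converts each chunk back into a string.
    ///
    /// # Examples
    ///
    /// A minimal sketch using a toy character-level tokenizer; `CharTokenizer` is purely
    /// illustrative and not part of the crate:
    ///
    /// ```
    /// use llm_chain::tokens::{TokenCollection, Tokenizer, TokenizerError};
    ///
    /// // Toy tokenizer that treats every character as one token.
    /// struct CharTokenizer;
    ///
    /// impl Tokenizer for CharTokenizer {
    ///     fn tokenize_str(&self, doc: &str) -> Result<TokenCollection, TokenizerError> {
    ///         Ok(doc.chars().map(|c| c as i32).collect::<Vec<i32>>().into())
    ///     }
    ///     fn to_string(&self, tokens: TokenCollection) -> Result<String, TokenizerError> {
    ///         let ids = tokens.as_i32()?;
    ///         Ok(ids.into_iter().map(|id| char::from_u32(id as u32).unwrap()).collect())
    ///     }
    /// }
    ///
    /// // Chunks of up to 3 tokens, overlapping by 1 token.
    /// let chunks = CharTokenizer.split_text("abcdef", 3, 1).unwrap();
    /// assert_eq!(chunks, vec!["abc", "cde", "ef"]);
    /// ```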
    fn split_text(
        &self,
        doc: &str,
        max_tokens_per_chunk: usize,
        chunk_overlap: usize,
    ) -> Result<Vec<String>, TokenizerError> {
        let tokens = self.tokenize_str(doc)?;
        let step_size = max(
            max_tokens_per_chunk.checked_sub(chunk_overlap).unwrap_or(1),
            1,
        );

        debug_assert_ne!(step_size, 0);

        (0..tokens.len())
            .step_by(step_size)
            .map(|start_idx| {
                let end_idx = usize::min(start_idx + max_tokens_per_chunk, tokens.len());
                self.to_string(tokens.slice(start_idx, end_idx))
            })
            .collect()
    }
}
/// Represents a single token.
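///
/// # Examples
///
/// Constructing a `Token` from an `i32` and reading it back:
///
/// ```
/// use llm_chain::tokens::Token;
/// let token: Token = 42i32.into();
/// assert_eq!(token.to_i32(), Some(42));
/// assert_eq!(token.to_usize(), None);
/// ```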
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(transparent)]
pub struct Token(TokenImpl);

#[derive(Serialize, Deserialize, Clone, Debug)]
enum TokenImpl {
    I32(i32),
    USize(usize),
}

impl From<i32> for Token {
    fn from(value: i32) -> Self {
        Token(TokenImpl::I32(value))
    }
}

impl From<usize> for Token {
    fn from(value: usize) -> Self {
        Token(TokenImpl::USize(value))
    }
}

impl Token {
    /// Returns the token as an `i32`, or `None` if it holds a `usize` value.
    pub fn to_i32(&self) -> Option<i32> {
        match &self.0 {
            TokenImpl::I32(x) => Some(*x),
            _ => None,
        }
    }

    /// Returns the token as a `usize`, or `None` if it holds an `i32` value.
    pub fn to_usize(&self) -> Option<usize> {
        match &self.0 {
            TokenImpl::USize(x) => Some(*x),
            _ => None,
        }
    }
}

/// A type-safe, enum-backed collection of tokens.
///
/// `TokenCollection` can hold a collection of `i32` or `usize` tokens,
/// ensuring type safety and efficient storage.
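///
/// # Examples
///
/// Building a collection from a `Vec<i32>` and converting it back:
///
/// ```
/// use llm_chain::tokens::TokenCollection;
/// let tokens = TokenCollection::from(vec![1i32, 2, 3]);
/// assert_eq!(tokens.len(), 3);
/// assert!(!tokens.is_empty());
/// assert_eq!(tokens.as_i32().unwrap(), vec![1, 2, 3]);
/// ```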
#[derive(Debug)]
pub struct TokenCollection(TokenCollectionImpl);

/// The internal enum representation of `TokenCollection`.
///
/// This enum holds the actual data for a `TokenCollection` instance,
/// allowing us to differentiate between the two types of collections
/// (`i32` and `usize`) in a type-safe manner.
#[derive(Debug)]
enum TokenCollectionImpl {
    /// A token collection of `i32` values.
    I32(Vec<i32>),
    /// A token collection of `usize` values. This variant should be avoided where possible,
    /// since the size of `usize` is platform-dependent, but it is required by some tokenizer libraries.
    Usize(Vec<usize>),
}

impl TokenCollection {
    /// Converts the `TokenCollection` into a vector of `i32`,
    /// if it contains `i32` values. Returns a `TokenCollectionTypeMismatch` error otherwise.
    pub fn as_i32(self) -> Result<Vec<i32>, TokenizerError> {
        match self.0 {
            TokenCollectionImpl::I32(v) => Ok(v),
            _ => Err(TokenizerError::TokenCollectionTypeMismatch),
        }
    }

    /// Converts the `TokenCollection` into a vector of `usize`,
    /// if it contains `usize` values. Returns a `TokenCollectionTypeMismatch` error otherwise.
    pub fn as_usize(self) -> Result<Vec<usize>, TokenizerError> {
        match self.0 {
            TokenCollectionImpl::Usize(v) => Ok(v),
            _ => Err(TokenizerError::TokenCollectionTypeMismatch),
        }
    }

    /// Returns the number of tokens in the collection.
    pub fn len(&self) -> usize {
        match &self.0 {
            TokenCollectionImpl::I32(x) => x.len(),
            TokenCollectionImpl::Usize(x) => x.len(),
        }
    }

    /// Returns true if the `TokenCollection` is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns a new collection containing the tokens from `start` (inclusive) to `end` (exclusive).
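    ///
    /// # Examples
    ///
    /// ```
    /// use llm_chain::tokens::TokenCollection;
    /// let tokens = TokenCollection::from(vec![10i32, 20, 30, 40]);
    /// let head = tokens.slice(0, 2);
    /// assert_eq!(head.as_i32().unwrap(), vec![10, 20]);
    /// ```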
    pub fn slice(&self, start: usize, end: usize) -> Self {
        match &self.0 {
            TokenCollectionImpl::I32(v) => Vec::from(&v[start..end]).into(),
            TokenCollectionImpl::Usize(v) => Vec::from(&v[start..end]).into(),
        }
    }
}

/// Enables the conversion from a vector of `i32` into a `TokenCollection`.
impl From<Vec<i32>> for TokenCollection {
    fn from(v: Vec<i32>) -> Self {
        TokenCollection(TokenCollectionImpl::I32(v))
    }
}

/// Enables the conversion from a vector of `usize` into a `TokenCollection`.
impl From<Vec<usize>> for TokenCollection {
    fn from(v: Vec<usize>) -> Self {
        TokenCollection(TokenCollectionImpl::Usize(v))
    }
}