Skip to main content

tokenx_rs/
config.rs

1//! Configuration types for token estimation.
2
/// Baseline characters-per-token ratio, intended for general English/Latin text.
///
/// This is the value `EstimationOptions::default()` assigns to its
/// `default_chars_per_token` field.
pub const DEFAULT_CHARS_PER_TOKEN: f64 = 6.0;
5
/// A language-specific rule that adjusts characters-per-token when matched.
///
/// The `matcher` function is called for each character in a word segment.
/// If it returns `true` for any character, the segment uses `chars_per_token`
/// instead of the default ratio.
///
/// `matcher` is a plain function pointer, so named functions and
/// non-capturing closures can both be stored here; capturing closures cannot.
#[derive(Debug, Clone)]
pub struct LanguageConfig {
    /// Character-level predicate that detects the language.
    pub matcher: fn(char) -> bool,
    /// Average characters per token for this language.
    pub chars_per_token: f64,
}
18
19/// Options for [`estimate_token_count_with_options`](crate::estimate_token_count_with_options).
20#[derive(Clone)]
21pub struct EstimationOptions {
22    /// Fallback characters-per-token ratio when no language matches.
23    pub default_chars_per_token: f64,
24    /// Language-specific overrides, checked in order.
25    pub language_configs: Vec<LanguageConfig>,
26}
27
28impl Default for EstimationOptions {
29    fn default() -> Self {
30        Self {
31            default_chars_per_token: DEFAULT_CHARS_PER_TOKEN,
32            language_configs: default_language_configs(),
33        }
34    }
35}
36
37/// Options for [`split_by_tokens`](crate::split_by_tokens).
38#[derive(Clone, Default)]
39pub struct SplitOptions {
40    /// Base estimation options.
41    pub estimation: EstimationOptions,
42    /// Number of overlapping tokens between consecutive chunks.
43    pub overlap: usize,
44}
45
/// Reports whether `c` is a German diacritic character: an umlaut
/// (`ä`, `ö`, `ü`) or eszett (`ß`), in lower or upper case.
pub fn is_german(c: char) -> bool {
    const GERMAN_LETTERS: [char; 8] = ['ä', 'Ä', 'ö', 'Ö', 'ü', 'Ü', 'ß', 'ẞ'];

    GERMAN_LETTERS.contains(&c)
}
50
/// Reports whether `c` is a French diacritic character.
///
/// The accepted set is accented vowels, `ç`, and the ligatures `œ`/`æ`,
/// each in lower and upper case.
pub fn is_french(c: char) -> bool {
    // Grouped by base letter to keep the table easy to audit.
    const FRENCH_LETTERS: &[char] = &[
        'é', 'É', 'è', 'È', 'ê', 'Ê', 'ë', 'Ë', // e
        'à', 'À', 'â', 'Â', // a
        'î', 'Î', 'ï', 'Ï', // i
        'ô', 'Ô', // o
        'û', 'Û', 'ù', 'Ù', 'ü', 'Ü', // u
        'ÿ', 'Ÿ', // y
        'ç', 'Ç', // c
        'œ', 'Œ', 'æ', 'Æ', // ligatures
    ];

    FRENCH_LETTERS.contains(&c)
}
88
/// Reports whether `c` is a Spanish diacritic character: an acute-accented
/// vowel, `ü`, or `ñ`, in lower or upper case.
pub fn is_spanish(c: char) -> bool {
    const SPANISH_LETTERS: [char; 14] = [
        'á', 'Á', 'é', 'É', 'í', 'Í', 'ó', 'Ó', 'ú', 'Ú', 'ü', 'Ü', 'ñ', 'Ñ',
    ];

    SPANISH_LETTERS.iter().any(|&letter| letter == c)
}
96
97/// Returns the default language configurations (German, French, Spanish).
98pub fn default_language_configs() -> Vec<LanguageConfig> {
99    vec![
100        LanguageConfig {
101            matcher: |c| is_german(c) || is_french(c),
102            chars_per_token: 3.0,
103        },
104        LanguageConfig {
105            matcher: is_spanish,
106            chars_per_token: 3.5,
107        },
108    ]
109}