tokenx_rs/config.rs
1//! Configuration types for token estimation.
2
/// Baseline characters-per-token ratio for general English/Latin text.
///
/// [`EstimationOptions::default`] uses this value as its
/// `default_chars_per_token` fallback.
pub const DEFAULT_CHARS_PER_TOKEN: f64 = 6.0;
5
/// A language-specific rule that overrides the characters-per-token ratio.
///
/// The `matcher` predicate is evaluated against the characters of a word
/// segment. As soon as it returns `true` for any character, the segment is
/// estimated with this rule's `chars_per_token` instead of the default ratio.
///
/// `Debug` is derived so the rule (and any options type that embeds it) can be
/// logged or printed with `{:?}`; the function pointer is rendered as an
/// address.
#[derive(Debug, Clone)]
pub struct LanguageConfig {
    /// Character-level predicate that detects the language.
    ///
    /// A plain function pointer, so both named functions and non-capturing
    /// closures can be stored here.
    pub matcher: fn(char) -> bool,
    /// Average number of characters per token for text in this language.
    pub chars_per_token: f64,
}
18
19/// Options for [`estimate_token_count_with_options`](crate::estimate_token_count_with_options).
20#[derive(Clone)]
21pub struct EstimationOptions {
22 /// Fallback characters-per-token ratio when no language matches.
23 pub default_chars_per_token: f64,
24 /// Language-specific overrides, checked in order.
25 pub language_configs: Vec<LanguageConfig>,
26}
27
28impl Default for EstimationOptions {
29 fn default() -> Self {
30 Self {
31 default_chars_per_token: DEFAULT_CHARS_PER_TOKEN,
32 language_configs: default_language_configs(),
33 }
34 }
35}
36
37/// Options for [`split_by_tokens`](crate::split_by_tokens).
38#[derive(Clone, Default)]
39pub struct SplitOptions {
40 /// Base estimation options.
41 pub estimation: EstimationOptions,
42 /// Number of overlapping tokens between consecutive chunks.
43 pub overlap: usize,
44}
45
/// Returns `true` if `c` is a German-specific letter: an umlaut vowel
/// (`ä`, `ö`, `ü`) or the eszett (`ß`), in either case.
///
/// Only these precomposed code points are recognised; plain ASCII letters
/// never match.
pub fn is_german(c: char) -> bool {
    // Lowercase/uppercase pairs; 'ẞ' is the capital eszett (U+1E9E).
    const GERMAN_LETTERS: &[char] = &['ä', 'Ä', 'ö', 'Ö', 'ü', 'Ü', 'ß', 'ẞ'];
    GERMAN_LETTERS.contains(&c)
}
50
/// Returns `true` if `c` is an accented letter or ligature used in French.
///
/// Covers acute, grave, circumflex and diaeresis vowels, the cedilla `ç`,
/// and the ligatures `œ`/`æ`, each in lowercase and uppercase. Only
/// these precomposed code points are recognised.
pub fn is_french(c: char) -> bool {
    // Lowercase/uppercase pairs, grouped by base letter.
    const FRENCH_LETTERS: &[char] = &[
        'é', 'É', 'è', 'È', 'ê', 'Ê', 'ë', 'Ë', // e
        'à', 'À', 'â', 'Â', // a
        'î', 'Î', 'ï', 'Ï', // i
        'ô', 'Ô', // o
        'û', 'Û', 'ù', 'Ù', 'ü', 'Ü', // u
        'ÿ', 'Ÿ', // y
        'ç', 'Ç', // c
        'œ', 'Œ', 'æ', 'Æ', // ligatures
    ];
    FRENCH_LETTERS.contains(&c)
}
88
/// Returns `true` if `c` is an accented letter used in Spanish.
///
/// Covers the acute-accented vowels, `ü`, and `ñ`, each in lowercase and
/// uppercase. Only these precomposed code points are recognised.
pub fn is_spanish(c: char) -> bool {
    // Lowercase/uppercase pairs: acute vowels first, then ü and ñ.
    const SPANISH_LETTERS: &[char] = &[
        'á', 'Á', 'é', 'É', 'í', 'Í', 'ó', 'Ó', 'ú', 'Ú', // acute vowels
        'ü', 'Ü', 'ñ', 'Ñ',
    ];
    SPANISH_LETTERS.contains(&c)
}
96
97/// Returns the default language configurations (German, French, Spanish).
98pub fn default_language_configs() -> Vec<LanguageConfig> {
99 vec![
100 LanguageConfig {
101 matcher: |c| is_german(c) || is_french(c),
102 chars_per_token: 3.0,
103 },
104 LanguageConfig {
105 matcher: is_spanish,
106 chars_per_token: 3.5,
107 },
108 ]
109}