1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
pub use ;
pub use TokenReducer;
// TODO: Reorganize `token_reduction`: move it out of the `text` module, and restructure `text` itself properly (into utils, etc.).
/// Reduces token count in text while preserving meaning and structure.
///
/// This function removes stopwords and redundancy, and applies compression
/// techniques based on the specified reduction level. Supports 64 languages with
/// automatic stopword removal and optional semantic clustering.
///
/// # Arguments
///
/// * `text` - The input text to reduce
/// * `config` - Configuration specifying reduction level and options
/// * `language_hint` - Optional ISO 639-3 language code (e.g., "eng", "spa")
///
/// # Returns
///
/// Returns the reduced text with preserved structure (markdown, code blocks).
///
/// # Errors
///
/// Returns an error if the language hint is invalid or stopwords cannot be loaded.
///
/// # Examples
///
/// ```rust
/// use kreuzberg::text::token_reduction::{reduce_tokens, TokenReductionConfig, ReductionLevel};
///
/// let text = "This is a simple example text with some stopwords.";
/// let config = TokenReductionConfig::default();
/// let reduced = reduce_tokens(text, &config, Some("eng"))?;
/// println!("Reduced: {}", reduced);
/// # Ok::<(), kreuzberg::error::KreuzbergError>(())
/// ```
/// Reduces token count for multiple texts efficiently using parallel processing.
///
/// This function processes multiple texts in parallel using Rayon, providing
/// significant performance improvements for batch operations. All texts use the
/// same configuration and language hint for consistency.
///
/// # Arguments
///
/// * `texts` - Slice of text references to reduce
/// * `config` - Configuration specifying reduction level and options
/// * `language_hint` - Optional ISO 639-3 language code (e.g., "eng", "spa")
///
/// # Returns
///
/// Returns a vector of reduced texts in the same order as the input.
///
/// # Errors
///
/// Returns an error if the language hint is invalid or stopwords cannot be loaded.
///
/// # Examples
///
/// ```rust
/// use kreuzberg::text::token_reduction::{batch_reduce_tokens, TokenReductionConfig, ReductionLevel};
///
/// let texts = vec![
///     "This is the first document with some text.",
///     "Here is another document with different content.",
///     "And finally, a third document to process.",
/// ];
/// let config = TokenReductionConfig::default();
/// let reduced = batch_reduce_tokens(&texts, &config, Some("eng"))?;
/// assert_eq!(reduced.len(), 3);
/// # Ok::<(), kreuzberg::error::KreuzbergError>(())
/// ```
/// Calculates detailed statistics comparing original and reduced text.
///
/// Provides comprehensive metrics including reduction percentages and absolute
/// counts for both characters and tokens. Useful for analyzing the effectiveness
/// of token reduction and monitoring compression ratios.
///
/// # Arguments
///
/// * `original` - The original text before reduction
/// * `reduced` - The reduced text after applying token reduction
///
/// # Returns
///
/// Returns a tuple with the following statistics (in order):
/// 1. `char_reduction` (f64) - Character reduction ratio (0.0 to 1.0)
/// 2. `token_reduction` (f64) - Token reduction ratio (0.0 to 1.0)
/// 3. `original_chars` (usize) - Original character count
/// 4. `reduced_chars` (usize) - Reduced character count
/// 5. `original_tokens` (usize) - Original token count (whitespace-delimited)
/// 6. `reduced_tokens` (usize) - Reduced token count (whitespace-delimited)
///
/// # Examples
///
/// ```rust
/// use kreuzberg::text::token_reduction::{reduce_tokens, get_reduction_statistics, TokenReductionConfig, ReductionLevel};
///
/// let original = "This is a simple example text with some stopwords and redundancy.";
/// let config = TokenReductionConfig::default();
/// let reduced = reduce_tokens(original, &config, Some("eng"))?;
///
/// let (char_ratio, token_ratio, orig_chars, red_chars, orig_tokens, red_tokens) =
///     get_reduction_statistics(original, &reduced);
///
/// println!("Reduced {:.1}% of characters ({} -> {})", char_ratio * 100.0, orig_chars, red_chars);
/// println!("Reduced {:.1}% of tokens ({} -> {})", token_ratio * 100.0, orig_tokens, red_tokens);
/// # Ok::<(), kreuzberg::error::KreuzbergError>(())
/// ```