reasonkit/processing/mod.rs

use crate::{Document, ProcessingState};

pub mod chunking;

/// Flags controlling how text is normalized; each step is optional.
#[derive(Debug, Clone, Default)]
pub struct NormalizationOptions {
    pub collapse_whitespace: bool,
    pub lowercase: bool,
    pub remove_punctuation: bool,
    pub trim: bool,
}

impl NormalizationOptions {
    /// Preset for indexing: conservative; keeps case and punctuation.
    pub fn for_indexing() -> Self {
        Self {
            collapse_whitespace: true,
            lowercase: false,
            remove_punctuation: false,
            trim: true,
        }
    }

    /// Preset for matching: aggressive; folds case and strips punctuation.
    pub fn for_matching() -> Self {
        Self {
            collapse_whitespace: true,
            lowercase: true,
            remove_punctuation: true,
            trim: true,
        }
    }
}

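/// Applies the enabled normalization steps in a fixed order: trim, collapse
/// whitespace, lowercase, then strip ASCII punctuation.
///
/// A minimal usage sketch; the `reasonkit::processing` path is an assumption
/// from the file location, so the example is marked `ignore`:
///
/// ```ignore
/// use reasonkit::processing::{normalize_text, NormalizationOptions};
///
/// let opts = NormalizationOptions::for_matching();
/// assert_eq!(normalize_text("  Hello,  World! ", &opts), "hello world");
/// ```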
pub fn normalize_text(text: &str, options: &NormalizationOptions) -> String {
    let mut result = text.to_string();

    if options.trim {
        result = result.trim().to_string();
    }

    if options.collapse_whitespace {
        result = collapse_whitespace(&result);
    }

    if options.lowercase {
        result = result.to_lowercase();
    }

    if options.remove_punctuation {
        result = result
            .chars()
            .filter(|c| !c.is_ascii_punctuation())
            .collect();
    }

    result
}

/// Replaces each run of whitespace (spaces, tabs, newlines) with a single space.
fn collapse_whitespace(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut prev_whitespace = false;

    for c in text.chars() {
        if c.is_whitespace() {
            if !prev_whitespace {
                result.push(' ');
            }
            prev_whitespace = true;
        } else {
            result.push(c);
            prev_whitespace = false;
        }
    }

    result
}

/// Rough token estimate from byte length, using the common heuristic of
/// about four bytes per token (rounded up).
pub fn estimate_tokens(text: &str) -> usize {
    text.len().div_ceil(4)
}

pub fn count_words(text: &str) -> usize {
    text.split_whitespace().count()
}

pub struct ProcessingPipeline {
    normalization: NormalizationOptions,
}

impl Default for ProcessingPipeline {
    fn default() -> Self {
        Self {
            normalization: NormalizationOptions::for_indexing(),
        }
    }
}

impl ProcessingPipeline {
    pub fn with_normalization(normalization: NormalizationOptions) -> Self {
        Self { normalization }
    }

    pub fn process_content(&self, content: &str) -> String {
        normalize_text(content, &self.normalization)
    }

    pub fn mark_processing(doc: &mut Document) {
        doc.processing.status = ProcessingState::Processing;
    }

    pub fn mark_complete(doc: &mut Document) {
        doc.processing.status = ProcessingState::Completed;
        doc.processing.indexed = true;
    }

    pub fn mark_failed(doc: &mut Document, error: &str) {
        doc.processing.status = ProcessingState::Failed;
        doc.processing.errors.push(error.to_string());
    }
}

/// Splits on '.', '!', and '?'. Naive: abbreviations and decimals also break.
pub fn extract_sentences(text: &str) -> Vec<&str> {
    text.split(['.', '!', '?'])
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .collect()
}

pub fn split_paragraphs(text: &str) -> Vec<&str> {
    text.split("\n\n")
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_text() {
        let options = NormalizationOptions::for_indexing();
        assert_eq!(normalize_text(" Hello World ", &options), "Hello World");
    }

    #[test]
    fn test_normalize_for_matching() {
        let options = NormalizationOptions::for_matching();
        assert_eq!(normalize_text("Hello, World!", &options), "hello world");
    }
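
    // Added example (not in the original tests): the derived Default leaves
    // every flag off, so normalization should be the identity.
    #[test]
    fn test_normalize_default_is_identity() {
        let options = NormalizationOptions::default();
        assert_eq!(normalize_text("  Hello,  World! ", &options), "  Hello,  World! ");
    }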

    #[test]
    fn test_estimate_tokens() {
        assert_eq!(estimate_tokens("hello"), 2);
        assert_eq!(estimate_tokens("hello world"), 3);
    }

    #[test]
    fn test_count_words() {
        assert_eq!(count_words("hello world"), 2);
        assert_eq!(count_words(" hello world "), 2);
    }
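
    // Added example: exercises the private collapse_whitespace helper
    // directly. Tabs and newlines collapse to single spaces; interior runs
    // shrink to one space, but leading/trailing whitespace is kept (as a
    // single space) since trimming is a separate step.
    #[test]
    fn test_collapse_whitespace() {
        assert_eq!(collapse_whitespace("a\t\tb\nc"), "a b c");
        assert_eq!(collapse_whitespace(" a  b "), " a b ");
    }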

    #[test]
    fn test_extract_sentences() {
        let text = "Hello world. How are you? I am fine!";
        let sentences = extract_sentences(text);
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world");
    }
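
    // Added example documenting a known limitation: the splitter is naive,
    // so abbreviations like "Dr." produce spurious sentence breaks.
    #[test]
    fn test_extract_sentences_splits_abbreviations() {
        let sentences = extract_sentences("Dr. Smith arrived.");
        assert_eq!(sentences, vec!["Dr", "Smith arrived"]);
    }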

    #[test]
    fn test_split_paragraphs() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird one.";
        let paragraphs = split_paragraphs(text);
        assert_eq!(paragraphs.len(), 3);
    }
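
    // Added sketch of end-to-end pipeline use. Only process_content is
    // exercised here; the mark_* methods need a constructed Document, and
    // its constructor lives outside this module.
    #[test]
    fn test_pipeline_process_content() {
        let indexing = ProcessingPipeline::default();
        assert_eq!(indexing.process_content("  Hello   World  "), "Hello World");

        let matching =
            ProcessingPipeline::with_normalization(NormalizationOptions::for_matching());
        assert_eq!(matching.process_content("Hello, World!"), "hello world");
    }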
}