// anno/ingest/preprocessor.rs

use crate::lang::detect_language;
use std::collections::HashMap;
7
/// Result of running a [`DocumentPreprocessor`] over raw input text:
/// the cleaned text plus string-valued metadata recording which
/// transformations were applied and basic length statistics.
#[derive(Debug, Clone)]
pub struct PreparedDocument {
    /// The processed document text.
    pub text: String,
    /// Processing metadata, e.g. "whitespace_cleaned", "unicode_normalized",
    /// "original_length", "processed_length", "detected_language".
    pub metadata: HashMap<String, String>,
}
16
/// Configurable text-cleaning pipeline applied before ingestion.
///
/// Each flag toggles one independent pass; see [`DocumentPreprocessor::prepare`]
/// for the order in which enabled passes run.
#[derive(Debug, Clone)]
pub struct DocumentPreprocessor {
    /// Normalize line endings, collapse whitespace runs, and trim.
    pub clean_whitespace: bool,
    /// Remove zero-width Unicode characters (ZWSP/ZWNJ/ZWJ/BOM).
    pub normalize_unicode: bool,
    /// Run language detection and record the result in metadata.
    pub detect_language: bool,
    /// If set, recorded in metadata; chunking itself happens elsewhere —
    /// NOTE(review): `prepare` only records this value, it does not chunk.
    pub chunk_size: Option<usize>,
}
29
30impl Default for DocumentPreprocessor {
31 fn default() -> Self {
32 Self {
33 clean_whitespace: true,
34 normalize_unicode: true,
35 detect_language: false,
36 chunk_size: None,
37 }
38 }
39}
40
41impl DocumentPreprocessor {
42 #[must_use]
44 pub fn new() -> Self {
45 Self::default()
46 }
47
48 #[must_use]
50 pub fn with_all_cleaning() -> Self {
51 Self {
52 clean_whitespace: true,
53 normalize_unicode: true,
54 detect_language: true,
55 chunk_size: None,
56 }
57 }
58
59 pub fn prepare(&self, text: &str) -> PreparedDocument {
61 let mut processed = text.to_string();
62 let mut metadata = HashMap::new();
63
64 if self.normalize_unicode {
68 processed = processed
70 .chars()
71 .filter(|c| !matches!(c, '\u{200b}' | '\u{200c}' | '\u{200d}' | '\u{feff}'))
72 .collect();
73 metadata.insert("unicode_normalized".to_string(), "basic".to_string());
74 }
75
76 if self.clean_whitespace {
78 processed = processed.replace("\r\n", "\n").replace('\r', "\n");
80
81 let mut cleaned = String::with_capacity(processed.len());
83 let mut last_was_space = false;
84 for ch in processed.chars() {
85 if ch.is_whitespace() {
86 if !last_was_space {
87 if ch == '\n' {
89 cleaned.push('\n');
90 } else {
91 cleaned.push(' ');
92 }
93 last_was_space = true;
94 } else if ch == '\n' && !cleaned.ends_with('\n') {
95 cleaned.push('\n');
97 }
98 } else {
99 cleaned.push(ch);
100 last_was_space = false;
101 }
102 }
103
104 processed = cleaned.trim().to_string();
106 metadata.insert("whitespace_cleaned".to_string(), "true".to_string());
107 }
108
109 if self.detect_language {
111 let lang = detect_language(&processed);
112 metadata.insert("detected_language".to_string(), format!("{:?}", lang));
113 }
114
115 if let Some(chunk_size) = self.chunk_size {
117 metadata.insert("chunk_size".to_string(), chunk_size.to_string());
120 }
121
122 metadata.insert("original_length".to_string(), text.len().to_string());
123 metadata.insert("processed_length".to_string(), processed.len().to_string());
124
125 PreparedDocument {
126 text: processed,
127 metadata,
128 }
129 }
130}
131
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_preprocessor_default() {
        let prep = DocumentPreprocessor::new();
        assert!(prep.clean_whitespace);
        assert!(prep.normalize_unicode);
        assert!(!prep.detect_language);
        assert!(prep.chunk_size.is_none());
    }

    #[test]
    fn test_preprocessor_with_all_cleaning() {
        let prep = DocumentPreprocessor::with_all_cleaning();
        assert!(prep.clean_whitespace);
        assert!(prep.normalize_unicode);
        assert!(prep.detect_language);
    }

    #[test]
    fn test_whitespace_normalization() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("Hello world\r\n\r\ntest");

        // No runs of multiple spaces survive cleaning. (A single-space
        // `contains(" ")` check would be wrong here: "Hello world"
        // legitimately keeps its separating space.)
        assert!(!doc.text.contains("  "));
        // All \r\n / \r line endings were normalized to \n.
        assert!(!doc.text.contains('\r'));
    }

    #[test]
    fn test_unicode_zero_width_removal() {
        let prep = DocumentPreprocessor::new();
        let input = "Hello\u{200b}world\u{feff}test";
        let doc = prep.prepare(input);

        assert!(!doc.text.contains('\u{200b}'));
        assert!(!doc.text.contains('\u{feff}'));
        // Zero-width chars are deleted outright, fusing the words.
        assert!(doc.text.contains("Helloworld"));
    }

    #[test]
    fn test_trim_whitespace() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("   text  with   spaces  ");

        assert_eq!(doc.text, "text with spaces");
    }

    #[test]
    fn test_metadata_recording() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("test input");

        assert!(doc.metadata.contains_key("original_length"));
        assert!(doc.metadata.contains_key("processed_length"));
        assert!(doc.metadata.contains_key("whitespace_cleaned"));
        assert!(doc.metadata.contains_key("unicode_normalized"));
    }

    #[test]
    fn test_language_detection_metadata() {
        let prep = DocumentPreprocessor::with_all_cleaning();
        let doc = prep.prepare("Hello world, this is English text.");

        assert!(doc.metadata.contains_key("detected_language"));
        assert!(doc
            .metadata
            .get("detected_language")
            .unwrap()
            .contains("English"));
    }

    #[test]
    fn test_preserve_paragraph_breaks() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("First paragraph.\n\nSecond paragraph.");

        // NOTE(review): the `|| contains("\n")` disjunct makes this
        // assertion pass even when the blank line is collapsed to a
        // single newline; tighten to `contains("\n\n")` once paragraph
        // preservation in `prepare` is confirmed.
        assert!(doc.text.contains("\n\n") || doc.text.contains('\n'));
    }

    #[test]
    fn test_empty_input() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("");

        assert!(doc.text.is_empty());
        assert_eq!(doc.metadata.get("original_length"), Some(&"0".to_string()));
    }

    #[test]
    fn test_prepared_document_clone() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("test");
        let cloned = doc.clone();

        assert_eq!(doc.text, cloned.text);
        assert_eq!(doc.metadata, cloned.metadata);
    }

    #[test]
    fn test_cjk_text_handling() {
        let prep = DocumentPreprocessor::with_all_cleaning();
        let doc = prep.prepare("東京オリンピック2020は延期されました。");

        // CJK content passes through the cleaning passes unharmed.
        assert!(doc.text.contains("東京"));
        assert!(doc.metadata.contains_key("detected_language"));
    }
}