Skip to main content

anno/ingest/
preprocessor.rs

1//! Document preprocessing and cleaning utilities.
2//!
3//! Provides text normalization, cleaning, and preparation for entity extraction.
4
5use crate::lang::detect_language;
6use std::collections::HashMap;
7
/// Prepared document with metadata.
///
/// Produced by [`DocumentPreprocessor::prepare`]: holds the cleaned text plus
/// a string-to-string record of which preparation steps ran (keys such as
/// `"original_length"`, `"processed_length"`, `"whitespace_cleaned"`,
/// `"unicode_normalized"`, `"detected_language"`, `"chunk_size"`).
///
/// `PartialEq` is derived so prepared documents can be compared directly
/// (both fields already support equality).
#[derive(Debug, Clone, PartialEq)]
pub struct PreparedDocument {
    /// The cleaned text.
    pub text: String,
    /// Metadata about the preparation process.
    pub metadata: HashMap<String, String>,
}
16
/// Document preprocessor for cleaning and normalizing text.
///
/// Each flag enables an independent cleaning step applied by `prepare`.
/// All fields are plain value types, so the struct derives `Copy` and full
/// equality for cheap passing and comparison in configuration code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DocumentPreprocessor {
    /// Normalize whitespace (collapse multiple spaces, normalize line breaks).
    pub clean_whitespace: bool,
    /// Normalize Unicode. NOTE: currently basic zero-width-character removal
    /// only, not full NFC (see `prepare`).
    pub normalize_unicode: bool,
    /// Detect and record language in the output metadata.
    pub detect_language: bool,
    /// Maximum chunk size (`None` = no chunking).
    pub chunk_size: Option<usize>,
}
29
30impl Default for DocumentPreprocessor {
31    fn default() -> Self {
32        Self {
33            clean_whitespace: true,
34            normalize_unicode: true,
35            detect_language: false,
36            chunk_size: None,
37        }
38    }
39}
40
41impl DocumentPreprocessor {
42    /// Create a new preprocessor with default settings.
43    #[must_use]
44    pub fn new() -> Self {
45        Self::default()
46    }
47
48    /// Create a preprocessor with all cleaning enabled.
49    #[must_use]
50    pub fn with_all_cleaning() -> Self {
51        Self {
52            clean_whitespace: true,
53            normalize_unicode: true,
54            detect_language: true,
55            chunk_size: None,
56        }
57    }
58
59    /// Prepare text for entity extraction.
60    pub fn prepare(&self, text: &str) -> PreparedDocument {
61        let mut processed = text.to_string();
62        let mut metadata = HashMap::new();
63
64        // Unicode normalization (NFC)
65        // Note: For now, we do basic normalization without external crate
66        // Full NFC normalization would require unicode-normalization crate
67        if self.normalize_unicode {
68            // Basic normalization: remove zero-width characters, normalize line breaks
69            processed = processed
70                .chars()
71                .filter(|c| !matches!(c, '\u{200b}' | '\u{200c}' | '\u{200d}' | '\u{feff}'))
72                .collect();
73            metadata.insert("unicode_normalized".to_string(), "basic".to_string());
74        }
75
76        // Whitespace cleaning
77        if self.clean_whitespace {
78            // Normalize line breaks to \n
79            processed = processed.replace("\r\n", "\n").replace('\r', "\n");
80
81            // Collapse multiple spaces (but preserve single spaces)
82            let mut cleaned = String::with_capacity(processed.len());
83            let mut last_was_space = false;
84            for ch in processed.chars() {
85                if ch.is_whitespace() {
86                    if !last_was_space {
87                        // Preserve newlines but collapse other whitespace
88                        if ch == '\n' {
89                            cleaned.push('\n');
90                        } else {
91                            cleaned.push(' ');
92                        }
93                        last_was_space = true;
94                    } else if ch == '\n' && !cleaned.ends_with('\n') {
95                        // Preserve consecutive newlines (paragraph breaks)
96                        cleaned.push('\n');
97                    }
98                } else {
99                    cleaned.push(ch);
100                    last_was_space = false;
101                }
102            }
103
104            // Trim leading/trailing whitespace
105            processed = cleaned.trim().to_string();
106            metadata.insert("whitespace_cleaned".to_string(), "true".to_string());
107        }
108
109        // Language detection
110        if self.detect_language {
111            let lang = detect_language(&processed);
112            metadata.insert("detected_language".to_string(), format!("{:?}", lang));
113        }
114
115        // Chunking (if requested)
116        if let Some(chunk_size) = self.chunk_size {
117            // For now, just record chunk size - actual chunking would be done
118            // at extraction time to preserve entity spans
119            metadata.insert("chunk_size".to_string(), chunk_size.to_string());
120        }
121
122        metadata.insert("original_length".to_string(), text.len().to_string());
123        metadata.insert("processed_length".to_string(), processed.len().to_string());
124
125        PreparedDocument {
126            text: processed,
127            metadata,
128        }
129    }
130}
131
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_preprocessor_default() {
        let pre = DocumentPreprocessor::new();
        assert!(pre.clean_whitespace);
        assert!(pre.normalize_unicode);
        assert!(!pre.detect_language);
        assert!(pre.chunk_size.is_none());
    }

    #[test]
    fn test_preprocessor_with_all_cleaning() {
        let pre = DocumentPreprocessor::with_all_cleaning();
        assert!(pre.clean_whitespace);
        assert!(pre.normalize_unicode);
        assert!(pre.detect_language);
    }

    #[test]
    fn test_whitespace_normalization() {
        let result = DocumentPreprocessor::new().prepare("Hello   world\r\n\r\ntest");

        // Runs of spaces must be collapsed...
        assert!(!result.text.contains("  "));
        // ...and CRLF line endings normalized to LF.
        assert!(!result.text.contains("\r"));
    }

    #[test]
    fn test_unicode_zero_width_removal() {
        let result = DocumentPreprocessor::new().prepare("Hello\u{200b}world\u{feff}test");

        assert!(!result.text.contains('\u{200b}'));
        assert!(!result.text.contains('\u{feff}'));
        assert!(result.text.contains("Helloworld"));
    }

    #[test]
    fn test_trim_whitespace() {
        let result = DocumentPreprocessor::new().prepare("   text with spaces   ");
        assert_eq!(result.text, "text with spaces");
    }

    #[test]
    fn test_metadata_recording() {
        let result = DocumentPreprocessor::new().prepare("test input");

        for key in [
            "original_length",
            "processed_length",
            "whitespace_cleaned",
            "unicode_normalized",
        ] {
            assert!(result.metadata.contains_key(key));
        }
    }

    #[test]
    fn test_language_detection_metadata() {
        let result =
            DocumentPreprocessor::with_all_cleaning().prepare("Hello world, this is English text.");

        let lang = result.metadata.get("detected_language");
        assert!(lang.is_some());
        assert!(lang.unwrap().contains("English"));
    }

    #[test]
    fn test_preserve_paragraph_breaks() {
        let result = DocumentPreprocessor::new().prepare("First paragraph.\n\nSecond paragraph.");

        // Should preserve the paragraph break.
        // NOTE(review): the second disjunct makes this assertion lenient —
        // it also passes if "\n\n" is collapsed to a single newline.
        assert!(result.text.contains("\n\n") || result.text.contains("\n"));
    }

    #[test]
    fn test_empty_input() {
        let result = DocumentPreprocessor::new().prepare("");

        assert!(result.text.is_empty());
        assert_eq!(
            result.metadata.get("original_length"),
            Some(&"0".to_string())
        );
    }

    #[test]
    fn test_prepared_document_clone() {
        let original = DocumentPreprocessor::new().prepare("test");
        let copy = original.clone();

        assert_eq!(original.text, copy.text);
        assert_eq!(original.metadata, copy.metadata);
    }

    #[test]
    fn test_cjk_text_handling() {
        let result =
            DocumentPreprocessor::with_all_cleaning().prepare("東京オリンピック2020は延期されました。");

        // CJK content must survive preprocessing intact.
        assert!(result.text.contains("東京"));
        // A language should have been recorded.
        assert!(result.metadata.contains_key("detected_language"));
    }
}
245}