use crate::lang::detect_language;
use std::collections::HashMap;
/// Result of running a document through [`DocumentPreprocessor::prepare`].
///
/// Derives `PartialEq`/`Eq` so that whole documents can be compared directly
/// (for example with `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PreparedDocument {
    /// The text after every enabled preprocessing step has been applied.
    pub text: String,
    /// String key/value pairs describing which steps ran and the byte lengths
    /// of the input and output text.
    pub metadata: HashMap<String, String>,
}
/// Configuration for the text clean-up steps applied by `prepare`.
///
/// Each flag enables one independent step. Derives `PartialEq`/`Eq` so that
/// configurations can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentPreprocessor {
    /// When `true`, newlines are normalized, runs of whitespace are collapsed
    /// and the text is trimmed at both ends.
    pub clean_whitespace: bool,
    /// When `true`, the text is passed through
    /// `textprep::unicode::remove_zero_width` and `textprep::unicode::nfc`.
    pub normalize_unicode: bool,
    /// When `true`, the language of the processed text is detected and
    /// recorded in the metadata under `detected_language`.
    pub detect_language: bool,
    /// Optional chunk size. `prepare` only records this value in the metadata
    /// (key `chunk_size`); it does not split the text itself.
    pub chunk_size: Option<usize>,
}
impl Default for DocumentPreprocessor {
fn default() -> Self {
Self {
clean_whitespace: true,
normalize_unicode: true,
detect_language: false,
chunk_size: None,
}
}
}
impl DocumentPreprocessor {
    /// Creates a preprocessor with the default configuration.
    ///
    /// Identical to [`DocumentPreprocessor::default`].
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a preprocessor with whitespace cleaning, Unicode normalization
    /// and language detection all enabled, and no chunk size.
    #[must_use]
    pub fn with_all_cleaning() -> Self {
        Self {
            clean_whitespace: true,
            normalize_unicode: true,
            detect_language: true,
            chunk_size: None,
        }
    }

    /// Runs every enabled preprocessing step over `text`.
    ///
    /// Steps run in a fixed order, each one operating on the output of the
    /// previous one:
    ///
    /// 1. `normalize_unicode`: zero-width removal followed by NFC
    ///    (`textprep::unicode`); records `unicode_normalized = "nfc"`.
    /// 2. `clean_whitespace`: newline normalization (`textprep::unicode`),
    ///    then whitespace collapsing and trimming; records
    ///    `whitespace_cleaned = "true"`.
    /// 3. `detect_language`: records the `Debug` rendering of the detection
    ///    result under `detected_language`.
    /// 4. `chunk_size`: when set, the value is recorded under `chunk_size`.
    ///    No chunking is performed here.
    ///
    /// `original_length` and `processed_length` are always recorded; both are
    /// lengths in bytes (`str::len`), not character counts.
    ///
    /// Returns the processed text together with the collected metadata.
    pub fn prepare(&self, text: &str) -> PreparedDocument {
        let mut processed = text.to_string();
        let mut metadata = HashMap::new();

        if self.normalize_unicode {
            processed = textprep::unicode::remove_zero_width(&processed);
            processed = textprep::unicode::nfc(&processed);
            metadata.insert("unicode_normalized".to_string(), "nfc".to_string());
        }

        if self.clean_whitespace {
            processed = textprep::unicode::normalize_newlines(&processed);
            processed = Self::collapse_whitespace(&processed);
            metadata.insert("whitespace_cleaned".to_string(), "true".to_string());
        }

        if self.detect_language {
            // Detection runs on the cleaned text, not on the raw input.
            let lang = detect_language(&processed);
            metadata.insert("detected_language".to_string(), format!("{:?}", lang));
        }

        if let Some(chunk_size) = self.chunk_size {
            metadata.insert("chunk_size".to_string(), chunk_size.to_string());
        }

        metadata.insert("original_length".to_string(), text.len().to_string());
        metadata.insert("processed_length".to_string(), processed.len().to_string());

        PreparedDocument {
            text: processed,
            metadata,
        }
    }

    /// Collapses every run of whitespace into a single separator and trims
    /// the result at both ends.
    ///
    /// A run that contains at least one `'\n'` becomes a single `'\n'`; any
    /// other run becomes a single `' '`. The newline wins regardless of where
    /// it sits inside the run, so a line never keeps trailing spaces in front
    /// of its line break.
    fn collapse_whitespace(input: &str) -> String {
        let mut cleaned = String::with_capacity(input.len());
        let mut in_run = false;

        for ch in input.chars() {
            if !ch.is_whitespace() {
                cleaned.push(ch);
                in_run = false;
            } else if !in_run {
                // First character of a run decides the provisional separator.
                cleaned.push(if ch == '\n' { '\n' } else { ' ' });
                in_run = true;
            } else if ch == '\n' && cleaned.ends_with(' ') {
                // The run started with a space but contains a line break:
                // upgrade the separator instead of emitting " \n".
                cleaned.pop();
                cleaned.push('\n');
            }
        }

        cleaned.trim().to_string()
    }
}
/// Unit tests for `DocumentPreprocessor` and `PreparedDocument`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_preprocessor_default() {
        let prep = DocumentPreprocessor::new();
        assert!(prep.clean_whitespace);
        assert!(prep.normalize_unicode);
        assert!(!prep.detect_language);
        assert!(prep.chunk_size.is_none());
    }

    #[test]
    fn test_preprocessor_with_all_cleaning() {
        let prep = DocumentPreprocessor::with_all_cleaning();
        assert!(prep.clean_whitespace);
        assert!(prep.normalize_unicode);
        assert!(prep.detect_language);
    }

    #[test]
    fn test_whitespace_normalization() {
        let prep = DocumentPreprocessor::new();
        // Several consecutive spaces plus CRLF line endings: single spaces
        // between words are legitimate, so the check is for *runs* of spaces.
        let doc = prep.prepare("Hello    world\r\n\r\ntest");
        assert!(!doc.text.contains("  "));
        assert!(!doc.text.contains('\r'));
        assert!(doc.text.contains("Hello world"));
    }

    #[test]
    fn test_unicode_zero_width_removal() {
        let prep = DocumentPreprocessor::new();
        // U+200B ZERO WIDTH SPACE and U+FEFF ZERO WIDTH NO-BREAK SPACE.
        let input = "Hello\u{200b}world\u{feff}test";
        let doc = prep.prepare(input);
        assert!(!doc.text.contains('\u{200b}'));
        assert!(!doc.text.contains('\u{feff}'));
        assert!(doc.text.contains("Helloworld"));
    }

    #[test]
    fn test_trim_whitespace() {
        let prep = DocumentPreprocessor::new();
        // Leading, trailing and interior runs are all exercised.
        let doc = prep.prepare("  text   with   spaces  ");
        assert_eq!(doc.text, "text with spaces");
    }

    #[test]
    fn test_metadata_recording() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("test input");
        assert!(doc.metadata.contains_key("original_length"));
        assert!(doc.metadata.contains_key("processed_length"));
        assert!(doc.metadata.contains_key("whitespace_cleaned"));
        assert_eq!(
            doc.metadata.get("unicode_normalized").map(String::as_str),
            Some("nfc"),
        );
    }

    #[test]
    fn test_nfc_normalization_precomposed_eq_decomposed() {
        let prep = DocumentPreprocessor::new();
        // U+00E9 (precomposed) versus 'e' followed by U+0301 COMBINING ACUTE
        // ACCENT: different code point sequences for the same text.
        let precomposed = "caf\u{00e9}";
        let decomposed = "cafe\u{0301}";
        assert_ne!(precomposed, decomposed);

        let a = prep.prepare(precomposed).text;
        let b = prep.prepare(decomposed).text;
        assert_eq!(a, b);
    }

    #[test]
    fn test_language_detection_metadata() {
        let prep = DocumentPreprocessor::with_all_cleaning();
        let doc = prep.prepare("Hello world, this is English text.");
        assert!(doc.metadata.contains_key("detected_language"));
        assert!(doc
            .metadata
            .get("detected_language")
            .unwrap()
            .contains("English"));
    }

    #[test]
    fn test_preserve_paragraph_breaks() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("First paragraph.\n\nSecond paragraph.");
        // The break between paragraphs must survive as at least one newline.
        assert!(doc.text.contains('\n'));
        assert!(doc.text.contains("First paragraph."));
        assert!(doc.text.contains("Second paragraph."));
    }

    #[test]
    fn test_empty_input() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("");
        assert!(doc.text.is_empty());
        assert_eq!(doc.metadata.get("original_length"), Some(&"0".to_string()));
    }

    #[test]
    fn test_prepared_document_clone() {
        let prep = DocumentPreprocessor::new();
        let doc = prep.prepare("test");
        let cloned = doc.clone();
        assert_eq!(doc.text, cloned.text);
        assert_eq!(doc.metadata, cloned.metadata);
    }

    #[test]
    fn test_cjk_text_handling() {
        let prep = DocumentPreprocessor::with_all_cleaning();
        let doc = prep.prepare("東京オリンピック2020は延期されました。");
        assert!(doc.text.contains("東京"));
        assert!(doc.metadata.contains_key("detected_language"));
    }
}