use crate::{Document, ProcessingState};
pub mod chunking;
/// Flags selecting which normalization steps `normalize_text` applies.
/// All flags default to `false`, i.e. no normalization at all.
#[derive(Debug, Clone, Default)]
pub struct NormalizationOptions {
    // Replace each run of whitespace with a single ASCII space.
    pub collapse_whitespace: bool,
    // Convert the text to lowercase.
    pub lowercase: bool,
    // Drop ASCII punctuation characters from the text.
    pub remove_punctuation: bool,
    // Strip leading and trailing whitespace.
    pub trim: bool,
}
impl NormalizationOptions {
    /// Preset for building an index: tidy whitespace and trim the edges,
    /// but keep the original casing and punctuation intact.
    pub fn for_indexing() -> Self {
        Self {
            collapse_whitespace: true,
            trim: true,
            // lowercase and remove_punctuation stay off (their defaults).
            ..Self::default()
        }
    }

    /// Preset for fuzzy matching: every normalization step enabled, so
    /// differently-formatted inputs canonicalize to the same string.
    pub fn for_matching() -> Self {
        Self {
            collapse_whitespace: true,
            lowercase: true,
            remove_punctuation: true,
            trim: true,
        }
    }
}
/// Applies the normalization steps enabled in `options` to `text` and
/// returns the normalized copy.
///
/// Steps run in a fixed order — lowercase, punctuation removal, whitespace
/// collapsing, trim — so that gaps opened up by stripping punctuation
/// (e.g. `"a - b"` → `"a   b"`) are still collapsed afterwards, and any
/// single leading/trailing space left by collapsing is still trimmed.
/// (Previously collapse/trim ran first, so `for_matching()` could return
/// strings with doubled interior spaces.)
pub fn normalize_text(text: &str, options: &NormalizationOptions) -> String {
    let mut result = text.to_string();
    if options.lowercase {
        result = result.to_lowercase();
    }
    if options.remove_punctuation {
        result = result
            .chars()
            .filter(|c| !c.is_ascii_punctuation())
            .collect();
    }
    if options.collapse_whitespace {
        result = collapse_whitespace(&result);
    }
    if options.trim {
        // Trim last: collapsing preserves one leading/trailing space.
        result = result.trim().to_string();
    }
    result
}
/// Replaces every run of consecutive whitespace (spaces, tabs, newlines)
/// with a single ASCII space. A run at the start or end of the input
/// becomes a single leading/trailing space; it is not removed.
fn collapse_whitespace(text: &str) -> String {
    text.chars()
        .fold(
            (String::with_capacity(text.len()), false),
            |(mut out, in_run), ch| {
                if ch.is_whitespace() {
                    // Emit one space for the first char of a run, drop the rest.
                    if !in_run {
                        out.push(' ');
                    }
                    (out, true)
                } else {
                    out.push(ch);
                    (out, false)
                }
            },
        )
        .0
}
/// Rough token-count heuristic: one token per 4 bytes of text, rounded
/// up so any non-empty input reports at least one token.
pub fn estimate_tokens(text: &str) -> usize {
    let byte_len = text.len();
    byte_len.div_ceil(4)
}
/// Counts whitespace-separated words; surrounding and repeated
/// whitespace contribute no empty words.
pub fn count_words(text: &str) -> usize {
    let words = text.split_whitespace();
    words.count()
}
/// Bundles the normalization settings used when processing document
/// content, plus status helpers for `Document` processing state.
pub struct ProcessingPipeline {
    // Options handed to `normalize_text` by `process_content`.
    normalization: NormalizationOptions,
}
impl Default for ProcessingPipeline {
    /// A pipeline configured with the indexing preset.
    fn default() -> Self {
        Self::with_normalization(NormalizationOptions::for_indexing())
    }
}
impl ProcessingPipeline {
    /// Builds a pipeline that uses the supplied normalization settings.
    pub fn with_normalization(normalization: NormalizationOptions) -> Self {
        Self { normalization }
    }

    /// Runs the configured normalization over `content` and returns the
    /// normalized text.
    pub fn process_content(&self, content: &str) -> String {
        normalize_text(content, &self.normalization)
    }

    /// Flags `doc` as currently being processed.
    pub fn mark_processing(doc: &mut Document) {
        doc.processing.status = ProcessingState::Processing;
    }

    /// Flags `doc` as fully processed and indexed.
    pub fn mark_complete(doc: &mut Document) {
        doc.processing.indexed = true;
        doc.processing.status = ProcessingState::Completed;
    }

    /// Flags `doc` as failed and records `error` alongside any earlier errors.
    pub fn mark_failed(doc: &mut Document, error: &str) {
        let message = error.to_string();
        doc.processing.status = ProcessingState::Failed;
        doc.processing.errors.push(message);
    }
}
/// Splits `text` into sentences on `.`, `!`, or `?`, trimming each piece
/// and dropping empty ones. Terminator characters are not included in
/// the returned slices.
pub fn extract_sentences(text: &str) -> Vec<&str> {
    let mut sentences = Vec::new();
    for raw in text.split(|c: char| matches!(c, '.' | '!' | '?')) {
        let sentence = raw.trim();
        if !sentence.is_empty() {
            sentences.push(sentence);
        }
    }
    sentences
}
/// Splits `text` into paragraphs on blank lines (`"\n\n"`), trimming
/// each paragraph and discarding empty ones.
pub fn split_paragraphs(text: &str) -> Vec<&str> {
    text.split("\n\n")
        .filter_map(|block| {
            let block = block.trim();
            (!block.is_empty()).then_some(block)
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_text() {
        // Indexing preset: whitespace tidied, casing and punctuation kept.
        let opts = NormalizationOptions::for_indexing();
        assert_eq!(normalize_text(" Hello World ", &opts), "Hello World");
    }

    #[test]
    fn test_normalize_for_matching() {
        // Matching preset: lowercased with punctuation stripped.
        let opts = NormalizationOptions::for_matching();
        assert_eq!(normalize_text("Hello, World!", &opts), "hello world");
    }

    #[test]
    fn test_estimate_tokens() {
        // ~4 bytes per token, rounded up.
        assert_eq!(estimate_tokens("hello"), 2);
        assert_eq!(estimate_tokens("hello world"), 3);
    }

    #[test]
    fn test_count_words() {
        assert_eq!(count_words("hello world"), 2);
        assert_eq!(count_words(" hello world "), 2);
    }

    #[test]
    fn test_extract_sentences() {
        let found = extract_sentences("Hello world. How are you? I am fine!");
        assert_eq!(found.len(), 3);
        assert_eq!(found[0], "Hello world");
    }

    #[test]
    fn test_split_paragraphs() {
        let input = "First paragraph.\n\nSecond paragraph.\n\nThird one.";
        assert_eq!(split_paragraphs(input).len(), 3);
    }
}