reasonkit/processing/mod.rs

//! Processing module for ReasonKit Core
//!
//! Provides document and text processing utilities for the RAG pipeline.
//!
//! ## Overview
//!
//! This module handles:
//! - Text normalization and cleaning
//! - Token counting and estimation
//! - Text chunking strategies
//! - Processing pipeline orchestration
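//!
//! ## Example
//!
//! A minimal sketch of the pipeline entry point. The `reasonkit_core` crate
//! path used in the examples throughout this module is assumed and may need
//! adjusting to the actual crate name, so the examples are marked `ignore`:
//!
//! ```ignore
//! use reasonkit_core::processing::ProcessingPipeline;
//!
//! // The default pipeline applies the `for_indexing` normalization preset.
//! let pipeline = ProcessingPipeline::default();
//! assert_eq!(pipeline.process_content("  Hello   World  "), "Hello World");
//! ```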

use crate::{Document, ProcessingState};

/// Document chunking module
pub mod chunking;

/// Text normalization options
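///
/// The derived `Default` leaves every flag disabled; enable flags selectively
/// with struct-update syntax (crate path assumed, as noted in the module docs):
///
/// ```ignore
/// use reasonkit_core::processing::NormalizationOptions;
///
/// let opts = NormalizationOptions {
///     lowercase: true,
///     trim: true,
///     ..Default::default()
/// };
/// ```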
#[derive(Debug, Clone, Default)]
pub struct NormalizationOptions {
    /// Remove extra whitespace
    pub collapse_whitespace: bool,
    /// Convert to lowercase
    pub lowercase: bool,
    /// Remove punctuation
    pub remove_punctuation: bool,
    /// Trim leading/trailing whitespace
    pub trim: bool,
}

impl NormalizationOptions {
    /// Default normalization for search indexing: trims and collapses
    /// whitespace while preserving case and punctuation
    pub fn for_indexing() -> Self {
        Self {
            collapse_whitespace: true,
            lowercase: false,
            remove_punctuation: false,
            trim: true,
        }
    }

    /// Aggressive normalization for matching: additionally lowercases
    /// the text and strips ASCII punctuation
    pub fn for_matching() -> Self {
        Self {
            collapse_whitespace: true,
            lowercase: true,
            remove_punctuation: true,
            trim: true,
        }
    }
}

/// Normalize text according to options; steps apply in order: trim,
/// whitespace collapsing, lowercasing, then punctuation removal
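///
/// # Examples
///
/// A sketch of the two built-in presets (crate path assumed):
///
/// ```ignore
/// use reasonkit_core::processing::{normalize_text, NormalizationOptions};
///
/// assert_eq!(
///     normalize_text("  Hello,   World!  ", &NormalizationOptions::for_indexing()),
///     "Hello, World!"
/// );
/// assert_eq!(
///     normalize_text("  Hello,   World!  ", &NormalizationOptions::for_matching()),
///     "hello world"
/// );
/// ```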
pub fn normalize_text(text: &str, options: &NormalizationOptions) -> String {
    let mut result = text.to_string();

    if options.trim {
        result = result.trim().to_string();
    }

    if options.collapse_whitespace {
        result = collapse_whitespace(&result);
    }

    if options.lowercase {
        result = result.to_lowercase();
    }

    if options.remove_punctuation {
        result = result
            .chars()
            .filter(|c| !c.is_ascii_punctuation())
            .collect();
    }

    result
}

/// Collapse multiple whitespace characters into single spaces
fn collapse_whitespace(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut prev_whitespace = false;

    for c in text.chars() {
        if c.is_whitespace() {
            if !prev_whitespace {
                result.push(' ');
            }
            prev_whitespace = true;
        } else {
            result.push(c);
            prev_whitespace = false;
        }
    }

    result
}

/// Estimate token count for text (rough approximation: ~4 bytes per token,
/// which matches ~4 characters for ASCII text)
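///
/// # Examples
///
/// The heuristic divides the byte length by four, rounding up; the values
/// mirror the unit tests below (crate path assumed):
///
/// ```ignore
/// use reasonkit_core::processing::estimate_tokens;
///
/// assert_eq!(estimate_tokens("hello"), 2);       // ceil(5 / 4)
/// assert_eq!(estimate_tokens("hello world"), 3); // ceil(11 / 4)
/// ```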
pub fn estimate_tokens(text: &str) -> usize {
    // Simple heuristic: ~4 characters per token for English text. Uses byte
    // length, which equals character count for ASCII; a rough estimate that
    // works reasonably well for most cases.
    text.len().div_ceil(4)
}

/// Count whitespace-delimited words in text
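///
/// # Examples
///
/// Leading, trailing, and repeated whitespace is ignored (crate path assumed):
///
/// ```ignore
/// use reasonkit_core::processing::count_words;
///
/// assert_eq!(count_words("  hello   world  "), 2);
/// ```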
pub fn count_words(text: &str) -> usize {
    text.split_whitespace().count()
}

/// Processing pipeline for documents
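///
/// # Examples
///
/// A sketch of a pipeline with a custom normalization preset (crate path
/// assumed):
///
/// ```ignore
/// use reasonkit_core::processing::{NormalizationOptions, ProcessingPipeline};
///
/// let pipeline = ProcessingPipeline::with_normalization(NormalizationOptions::for_matching());
/// assert_eq!(pipeline.process_content("Hello, World!"), "hello world");
/// ```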
pub struct ProcessingPipeline {
    normalization: NormalizationOptions,
}

impl Default for ProcessingPipeline {
    fn default() -> Self {
        Self {
            normalization: NormalizationOptions::for_indexing(),
        }
    }
}

impl ProcessingPipeline {
    /// Create a new pipeline with custom normalization
    pub fn with_normalization(normalization: NormalizationOptions) -> Self {
        Self { normalization }
    }

    /// Process a document's content
    pub fn process_content(&self, content: &str) -> String {
        normalize_text(content, &self.normalization)
    }

    /// Mark document as currently being processed
    pub fn mark_processing(doc: &mut Document) {
        doc.processing.status = ProcessingState::Processing;
    }

    /// Mark document as processed and indexed
    pub fn mark_complete(doc: &mut Document) {
        doc.processing.status = ProcessingState::Completed;
        doc.processing.indexed = true;
    }

    /// Mark document processing as failed, recording the error
    pub fn mark_failed(doc: &mut Document, error: &str) {
        doc.processing.status = ProcessingState::Failed;
        doc.processing.errors.push(error.to_string());
    }
}

/// Extract sentences from text
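///
/// # Examples
///
/// Naive terminator-based splitting (crate path assumed):
///
/// ```ignore
/// use reasonkit_core::processing::extract_sentences;
///
/// let sentences = extract_sentences("Hello world. How are you? I am fine!");
/// assert_eq!(sentences, vec!["Hello world", "How are you", "I am fine"]);
/// ```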
pub fn extract_sentences(text: &str) -> Vec<&str> {
    // Simple sentence splitting on common terminators; the terminators are
    // dropped, and abbreviations or decimals will be split incorrectly.
    // For production, consider using a proper NLP library.
    text.split(['.', '!', '?'])
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .collect()
}

/// Split text into paragraphs
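///
/// # Examples
///
/// Paragraphs are taken to be blank-line separated (crate path assumed):
///
/// ```ignore
/// use reasonkit_core::processing::split_paragraphs;
///
/// let paragraphs = split_paragraphs("First paragraph.\n\nSecond paragraph.");
/// assert_eq!(paragraphs, vec!["First paragraph.", "Second paragraph."]);
/// ```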
pub fn split_paragraphs(text: &str) -> Vec<&str> {
    // Paragraphs are assumed to be separated by a blank line ("\n\n");
    // Windows-style "\r\n\r\n" separators are not handled here.
    text.split("\n\n")
        .map(|s| s.trim())
        .filter(|s| !s.is_empty())
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_text() {
        let options = NormalizationOptions::for_indexing();
        assert_eq!(normalize_text("  Hello   World  ", &options), "Hello World");
    }

    #[test]
    fn test_normalize_for_matching() {
        let options = NormalizationOptions::for_matching();
        assert_eq!(normalize_text("Hello, World!", &options), "hello world");
    }

    #[test]
    fn test_estimate_tokens() {
        // ~4 chars per token
        assert_eq!(estimate_tokens("hello"), 2); // 5 chars -> ~2 tokens
        assert_eq!(estimate_tokens("hello world"), 3); // 11 chars -> ~3 tokens
    }

    #[test]
    fn test_count_words() {
        assert_eq!(count_words("hello world"), 2);
        assert_eq!(count_words("  hello   world  "), 2);
    }

    #[test]
    fn test_extract_sentences() {
        let text = "Hello world. How are you? I am fine!";
        let sentences = extract_sentences(text);
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world");
    }

    #[test]
    fn test_split_paragraphs() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird one.";
        let paragraphs = split_paragraphs(text);
        assert_eq!(paragraphs.len(), 3);
    }
}