Skip to main content

synapse_core/ingest/
processor.rs

1use anyhow::Result;
2use html2text::from_read;
3use std::io::Cursor;
4
/// Tunable limits that govern how text is divided into chunks.
#[derive(Clone, Debug)]
pub struct ProcessorConfig {
    /// Upper bound on the number of characters in a single chunk.
    pub chunk_size: usize,
    /// How many characters consecutive chunks share with each other.
    pub chunk_overlap: usize,
}

impl Default for ProcessorConfig {
    /// Returns the stock settings: a chunk size of 1000 and an overlap of 200.
    fn default() -> Self {
        let chunk_size = 1000;
        let chunk_overlap = 200;
        ProcessorConfig {
            chunk_size,
            chunk_overlap,
        }
    }
}
22
23/// Advanced processor for text and HTML content.
24pub struct Processor {
25    config: ProcessorConfig,
26}
27
28impl Processor {
29    /// Creates a new Processor with the given configuration.
30    pub fn new(config: ProcessorConfig) -> Self {
31        Self { config }
32    }
33
34    /// Processes HTML content: sanitizes it to text and then chunks it.
35    pub fn process_html(&self, html: &str) -> Result<Vec<String>> {
36        // Use a reasonable width for text wrapping, e.g., 120.
37        // This helps maintain some structure while converting to text.
38        let text = from_read(Cursor::new(html), 120).map_err(|e| anyhow::anyhow!(e))?;
39        Ok(self.chunk_text(&text))
40    }
41
42    /// Splits text into overlapping chunks based on the configuration.
43    /// Tries to split on whitespace to preserve word boundaries.
44    pub fn chunk_text(&self, text: &str) -> Vec<String> {
45        if text.is_empty() {
46            return Vec::new();
47        }
48
49        let mut chunks = Vec::new();
50        // Split by whitespace but keep the delimiter to reconstruct faithfully
51        let words: Vec<&str> = text.split_inclusive(char::is_whitespace).collect();
52
53        let mut current_chunk = String::new();
54        let mut current_len = 0;
55        // Keep track of words in the current chunk to handle overlap efficiently
56        let mut current_words: Vec<&str> = Vec::new();
57
58        for word in words {
59            let word_len = word.len();
60
61            // If adding this word exceeds chunk_size, we finalize the current chunk
62            if current_len + word_len > self.config.chunk_size {
63                if !current_chunk.is_empty() {
64                    chunks.push(current_chunk.trim().to_string());
65                }
66
67                // Prepare the next chunk with overlap
68                let mut overlap_chunk = String::new();
69                let mut overlap_len = 0;
70                let mut new_current_words = Vec::new();
71
72                // Work backwards to find how many words fit in the overlap
73                for w in current_words.iter().rev() {
74                    if overlap_len + w.len() <= self.config.chunk_overlap {
75                        new_current_words.push(*w);
76                        overlap_len += w.len();
77                    } else {
78                        break;
79                    }
80                }
81                new_current_words.reverse();
82
83                // Reconstruct the overlap string
84                for w in &new_current_words {
85                    overlap_chunk.push_str(w);
86                }
87
88                current_chunk = overlap_chunk;
89                current_len = overlap_len;
90                current_words = new_current_words;
91            }
92
93            current_chunk.push_str(word);
94            current_len += word_len;
95            current_words.push(word);
96        }
97
98        // Add the last chunk if not empty
99        if !current_chunk.is_empty() {
100            chunks.push(current_chunk.trim().to_string());
101        }
102
103        chunks
104    }
105}
106
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a processor with the given limits.
    fn processor(chunk_size: usize, chunk_overlap: usize) -> Processor {
        Processor::new(ProcessorConfig {
            chunk_size,
            chunk_overlap,
        })
    }

    #[test]
    fn test_chunk_text_empty() {
        assert!(processor(10, 0).chunk_text("").is_empty());
    }

    #[test]
    fn test_chunk_text_simple() {
        let chunks = processor(10, 0).chunk_text("one two three four");

        // Tokens keep their trailing space: "one ", "two ", "three ", "four".
        // "one two " is 8 long; adding "three " (6) would give 14 > 10, so the
        // first chunk closes there. "three " + "four" is exactly 10 and fits.
        assert_eq!(chunks, vec!["one two", "three four"]);
    }

    #[test]
    fn test_chunk_text_overlap() {
        let chunks = processor(15, 6).chunk_text("one two three four five");

        // Tokens: "one " (4), "two " (4), "three " (6), "four " (5), "five" (4).
        // "one two three " is 14 long; adding "four " would give 19 > 15, so
        // the first chunk is "one two three" after trimming.
        // Overlap budget is 6: "three " (6) fits, "two " would exceed it.
        // The second chunk is "three " + "four " + "five" = 15, which is not
        // greater than 15, so everything left lands in that chunk.
        assert_eq!(chunks, vec!["one two three", "three four five"]);
    }

    #[test]
    fn test_process_html() {
        let processor = Processor::new(ProcessorConfig::default());
        let html = "<html><body><h1>Title</h1><p>Paragraph 1.</p></body></html>";

        let chunks = processor.process_html(html).unwrap();
        assert!(!chunks.is_empty());

        // The exact rendering of headings is up to the converter, so only
        // check that the text survived and the markup did not.
        let combined = chunks.join(" ");
        assert!(combined.contains("Title"));
        assert!(combined.contains("Paragraph 1"));
        assert!(!combined.contains("<html>"));
    }
}
175}