synapse_core/ingest/
processor.rs1use anyhow::Result;
2use html2text::from_read;
3use std::io::Cursor;
4
/// Settings that control how text is split into chunks.
///
/// Both limits are compared against `str::len`, i.e. they are measured in
/// UTF-8 bytes rather than characters.
#[derive(Clone, Debug)]
pub struct ProcessorConfig {
    /// Target upper bound on the length of a single chunk.
    pub chunk_size: usize,
    /// Upper bound on how much trailing text of one chunk is repeated at the
    /// start of the next chunk.
    pub chunk_overlap: usize,
}
13
14impl Default for ProcessorConfig {
15 fn default() -> Self {
16 Self {
17 chunk_size: 1000,
18 chunk_overlap: 200,
19 }
20 }
21}
22
23pub struct Processor {
25 config: ProcessorConfig,
26}
27
28impl Processor {
29 pub fn new(config: ProcessorConfig) -> Self {
31 Self { config }
32 }
33
34 pub fn process_html(&self, html: &str) -> Result<Vec<String>> {
36 let text = from_read(Cursor::new(html), 120).map_err(|e| anyhow::anyhow!(e))?;
39 Ok(self.chunk_text(&text))
40 }
41
42 pub fn chunk_text(&self, text: &str) -> Vec<String> {
45 if text.is_empty() {
46 return Vec::new();
47 }
48
49 let mut chunks = Vec::new();
50 let words: Vec<&str> = text.split_inclusive(char::is_whitespace).collect();
52
53 let mut current_chunk = String::new();
54 let mut current_len = 0;
55 let mut current_words: Vec<&str> = Vec::new();
57
58 for word in words {
59 let word_len = word.len();
60
61 if current_len + word_len > self.config.chunk_size {
63 if !current_chunk.is_empty() {
64 chunks.push(current_chunk.trim().to_string());
65 }
66
67 let mut overlap_chunk = String::new();
69 let mut overlap_len = 0;
70 let mut new_current_words = Vec::new();
71
72 for w in current_words.iter().rev() {
74 if overlap_len + w.len() <= self.config.chunk_overlap {
75 new_current_words.push(*w);
76 overlap_len += w.len();
77 } else {
78 break;
79 }
80 }
81 new_current_words.reverse();
82
83 for w in &new_current_words {
85 overlap_chunk.push_str(w);
86 }
87
88 current_chunk = overlap_chunk;
89 current_len = overlap_len;
90 current_words = new_current_words;
91 }
92
93 current_chunk.push_str(word);
94 current_len += word_len;
95 current_words.push(word);
96 }
97
98 if !current_chunk.is_empty() {
100 chunks.push(current_chunk.trim().to_string());
101 }
102
103 chunks
104 }
105}
106
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a processor with the given chunk size and overlap.
    fn make_processor(chunk_size: usize, chunk_overlap: usize) -> Processor {
        Processor::new(ProcessorConfig {
            chunk_size,
            chunk_overlap,
        })
    }

    /// Without overlap, the text is cut at word boundaries into exactly two
    /// chunks.
    #[test]
    fn test_chunk_text_simple() {
        let chunks = make_processor(10, 0).chunk_text("one two three four");

        assert_eq!(chunks, vec!["one two", "three four"]);
    }

    /// With overlap, the last word of the first chunk is repeated in the
    /// second chunk.
    #[test]
    fn test_chunk_text_overlap() {
        let chunks = make_processor(15, 6).chunk_text("one two three four five");

        assert!(chunks.len() >= 2);
        assert_eq!(chunks[0], "one two three");
        assert!(chunks[1].contains("three"));
    }

    /// HTML input is reduced to its text content before chunking.
    #[test]
    fn test_process_html() {
        let processor = Processor::new(ProcessorConfig::default());
        let html = "<html><body><h1>Title</h1><p>Paragraph 1.</p></body></html>";

        let chunks = processor.process_html(html).unwrap();
        assert!(!chunks.is_empty());

        let combined = chunks.join(" ");
        for expected in ["Title", "Paragraph 1"] {
            assert!(combined.contains(expected));
        }
        assert!(!combined.contains("<html>"));
    }
}