use anyhow::Result;
use html2text::from_read;
use std::io::Cursor;
/// Tunable limits that control how text is split into chunks.
#[derive(Debug, Clone)]
pub struct ProcessorConfig {
    /// Size limit for a chunk, measured in bytes of the input text
    /// (whitespace included). A new chunk is started once appending the next
    /// whitespace-delimited token would go past this limit.
    pub chunk_size: usize,
    /// Maximum number of bytes of trailing tokens from one chunk that are
    /// repeated at the start of the following chunk.
    pub chunk_overlap: usize,
}

impl Default for ProcessorConfig {
    /// Returns the stock configuration: 1000-byte chunks with up to 200 bytes
    /// of overlap between neighbours.
    fn default() -> Self {
        const DEFAULT_CHUNK_SIZE: usize = 1000;
        const DEFAULT_CHUNK_OVERLAP: usize = 200;

        ProcessorConfig {
            chunk_size: DEFAULT_CHUNK_SIZE,
            chunk_overlap: DEFAULT_CHUNK_OVERLAP,
        }
    }
}
/// Splits plain text, or the text rendered from HTML, into overlapping chunks.
pub struct Processor {
    /// Limits applied by every chunking call.
    config: ProcessorConfig,
}

impl Processor {
    /// Creates a processor that chunks according to `config`.
    pub fn new(config: ProcessorConfig) -> Self {
        Self { config }
    }

    /// Renders `html` to plain text and splits the result into chunks.
    ///
    /// The document is rendered by `html2text` with a width of 120 before
    /// being passed to [`Processor::chunk_text`].
    ///
    /// # Errors
    ///
    /// Returns an error if `html2text` fails to render the input.
    pub fn process_html(&self, html: &str) -> Result<Vec<String>> {
        let text = from_read(Cursor::new(html), 120).map_err(|e| anyhow::anyhow!(e))?;
        Ok(self.chunk_text(&text))
    }

    /// Splits `text` into trimmed chunks along whitespace boundaries.
    ///
    /// Tokens (a word together with its trailing whitespace character) are
    /// accumulated until the next one would push the chunk past
    /// `chunk_size` bytes. The chunk is then emitted and the following chunk
    /// starts with up to `chunk_overlap` bytes of its trailing tokens.
    ///
    /// Guarantees:
    /// * No empty or whitespace-only chunk is ever returned; empty or blank
    ///   input yields an empty vector.
    /// * A chunk never consists solely of text repeated from its predecessor.
    /// * A chunk only exceeds `chunk_size` when a single token is itself
    ///   longer than the limit; such a token is emitted on its own.
    ///
    /// Sizes are byte lengths (`str::len`), not character counts.
    pub fn chunk_text(&self, text: &str) -> Vec<String> {
        let chunk_size = self.config.chunk_size;
        let chunk_overlap = self.config.chunk_overlap;

        let mut chunks = Vec::new();
        // Tokens of the chunk under construction, in source order.
        let mut pending: Vec<&str> = Vec::new();
        let mut pending_len = 0usize;
        // True once `pending` holds a non-whitespace token that was not
        // carried over from the previous chunk.
        let mut has_new_content = false;

        for token in text.split_inclusive(char::is_whitespace) {
            let token_len = token.len();
            if pending_len + token_len > chunk_size {
                if has_new_content {
                    chunks.push(pending.concat().trim().to_string());
                }
                // Leave room for the incoming token so that overlap never
                // pushes the next chunk past the size limit on its own.
                let budget = chunk_overlap.min(chunk_size.saturating_sub(token_len));
                pending_len = Self::keep_overlap(&mut pending, budget);
                has_new_content = false;
            }
            has_new_content |= !token.trim().is_empty();
            pending.push(token);
            pending_len += token_len;
        }

        if has_new_content {
            chunks.push(pending.concat().trim().to_string());
        }
        chunks
    }

    /// Shrinks `tokens` to its longest suffix whose combined byte length fits
    /// within `budget`, returning that combined length.
    fn keep_overlap(tokens: &mut Vec<&str>, budget: usize) -> usize {
        let mut kept_len = 0;
        let kept = tokens
            .iter()
            .rev()
            .take_while(|token| {
                let fits = kept_len + token.len() <= budget;
                if fits {
                    kept_len += token.len();
                }
                fits
            })
            .count();
        let dropped = tokens.len() - kept;
        tokens.drain(..dropped);
        kept_len
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a processor with the given size and overlap limits.
    fn processor(chunk_size: usize, chunk_overlap: usize) -> Processor {
        Processor::new(ProcessorConfig {
            chunk_size,
            chunk_overlap,
        })
    }

    #[test]
    fn test_chunk_text_simple() {
        let chunks = processor(10, 0).chunk_text("one two three four");
        assert_eq!(chunks, ["one two", "three four"]);
    }

    #[test]
    fn test_chunk_text_overlap() {
        let chunks = processor(15, 6).chunk_text("one two three four five");
        // Pin the exact split: "three " (6 bytes) is the only trailing token
        // that fits the overlap budget, so it alone is repeated.
        assert_eq!(chunks, ["one two three", "three four five"]);
    }

    #[test]
    fn test_chunk_text_empty() {
        assert!(processor(10, 0).chunk_text("").is_empty());
    }

    #[test]
    fn test_process_html() {
        let processor = Processor::new(ProcessorConfig::default());
        let html = "<html><body><h1>Title</h1><p>Paragraph 1.</p></body></html>";
        let chunks = processor.process_html(html).unwrap();
        assert!(!chunks.is_empty());
        let combined = chunks.join(" ");
        assert!(combined.contains("Title"));
        assert!(combined.contains("Paragraph 1"));
        assert!(!combined.contains("<html>"));
    }
}