use crate::models::{Chunk, Document};
use crate::traits::{Chunker, Result};
/// Chunker that cuts a document's HTML content at a fixed set of tag markers
/// and groups the resulting pieces into size-limited chunks.
pub struct HtmlSemanticChunker {
    /// Soft limit, in bytes, on the text accumulated for one chunk.
    ///
    /// When appending the next piece would push the accumulated text past this
    /// value, the accumulated text is emitted as a chunk first. A single piece
    /// that is longer than the limit on its own is still emitted whole.
    pub max_chunk_size: usize,
}
impl HtmlSemanticChunker {
pub fn new(max_chunk_size: usize) -> Self {
Self { max_chunk_size }
}
}
impl Chunker for HtmlSemanticChunker {
    /// Splits `document.content` into chunks aligned with HTML tag boundaries.
    ///
    /// The content is first cut into pieces: a piece ends right after one of
    /// `</div>`, `<br>`, `</h1>`, `</h2>`, `</h3>`, or right before a `<p>`, so
    /// that an opening `<p>` stays attached to the paragraph it starts. Tags
    /// are matched exactly (case-sensitive, no attributes).
    ///
    /// Pieces are appended to a buffer. When adding the next piece would make
    /// the buffer longer than `max_chunk_size` bytes, the buffer is emitted as
    /// a chunk first. A single piece longer than the limit is never cut, so a
    /// chunk may exceed `max_chunk_size`.
    ///
    /// Chunk text is trimmed of surrounding whitespace, chunks whose trimmed
    /// text is empty are not emitted, and `index` counts emitted chunks from 0.
    ///
    /// # Errors
    ///
    /// This implementation always returns `Ok`.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        let content: &str = &document.content;
        let mut chunks = Vec::new();
        let mut index = 0;
        let mut buffer = String::new();

        // Emits the buffered text as a chunk (unless it is blank) and resets
        // the buffer for the next chunk.
        let mut flush = |buffer: &mut String| {
            let text = buffer.trim();
            if !text.is_empty() {
                chunks.push(Chunk {
                    document_id: document.id.clone(),
                    index,
                    text: text.to_string(),
                });
                index += 1;
            }
            buffer.clear();
        };

        let mut cursor = 0;
        while cursor < content.len() {
            let split_end = next_html_split_end(content, cursor);
            let piece = &content[cursor..split_end];

            // Only flush a non-empty buffer: an oversized piece arriving on an
            // empty buffer is kept whole rather than cut mid-element.
            if !buffer.is_empty() && buffer.len() + piece.len() > self.max_chunk_size {
                flush(&mut buffer);
            }
            buffer.push_str(piece);
            cursor = split_end;
        }
        flush(&mut buffer);

        Ok(chunks)
    }
}

/// Returns the byte offset where the piece starting at `from` ends.
///
/// Scans forward once over the `<` characters after `from` and stops at the
/// first recognised tag, so repeated calls walk the document in linear time.
/// The result is always greater than `from` when `from < content.len()`, and
/// equals `content.len()` when no further boundary exists.
fn next_html_split_end(content: &str, from: usize) -> usize {
    // Closing / void tags: the boundary falls right after the tag.
    const SPLIT_AFTER: [&str; 5] = ["</div>", "<br>", "</h1>", "</h2>", "</h3>"];
    // Opening tag: the boundary falls right before it, so the tag stays with
    // the paragraph it opens.
    const SPLIT_BEFORE: &str = "<p>";

    let rest = &content[from..];
    for (offset, _) in rest.match_indices('<') {
        let tail = &rest[offset..];
        if tail.starts_with(SPLIT_BEFORE) {
            // A `<p>` exactly at `from` begins the current piece; cutting there
            // would produce an empty piece and never advance.
            if offset > 0 {
                return from + offset;
            }
        } else if let Some(tag) = SPLIT_AFTER.iter().find(|tag| tail.starts_with(**tag)) {
            return from + offset + tag.len();
        }
    }
    content.len()
}