// cerebro 1.1.8
//
// A blazing-fast AI memory layer that enables teams of specialized agents to collaborate through a shared cognitive architecture.
// Documentation
use crate::models::{Chunk, Document};
use crate::traits::{Chunker, Result};

/// Chunker that cuts HTML content at a fixed set of tags and packs the
/// resulting segments into chunks bounded by a size budget.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlSemanticChunker {
    /// Size budget for a chunk, compared against `String::len()` (bytes, not
    /// characters) while segments are accumulated.
    pub max_chunk_size: usize,
}

impl HtmlSemanticChunker {
    pub fn new(max_chunk_size: usize) -> Self {
        Self { max_chunk_size }
    }
}

impl Chunker for HtmlSemanticChunker {
    /// Splits `document.content` into chunks that end on HTML tag boundaries.
    ///
    /// The content is cut into segments, each ending immediately after the
    /// next occurrence of one of the `SPLIT_TAGS`; the final segment runs to
    /// the end of the content. Segments are appended to the current chunk
    /// until adding one more would push its byte length past
    /// `max_chunk_size`, at which point the current chunk is emitted and a new
    /// one is started with that segment.
    ///
    /// A single segment longer than `max_chunk_size` is never split further,
    /// so an emitted chunk can exceed the budget. Chunk text is trimmed of
    /// surrounding whitespace, and a trailing whitespace-only chunk is
    /// dropped. Chunks are numbered from 0 in emission order.
    ///
    /// # Errors
    ///
    /// This implementation always returns `Ok`.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        let content = &document.content;
        let mut chunks = Vec::new();
        let mut current_chunk = String::new();
        let mut index = 0;
        let mut last_pos = 0;

        while last_pos < content.len() {
            let next_split = next_split_after(content, last_pos);
            let segment = &content[last_pos..next_split];

            // Flush before appending so a chunk only overshoots the budget
            // when a single segment is itself larger than the budget.
            let over_budget = current_chunk.len() + segment.len() > self.max_chunk_size;
            if over_budget && !current_chunk.is_empty() {
                chunks.push(Chunk {
                    document_id: document.id.clone(),
                    index,
                    text: current_chunk.trim().to_string(),
                });
                index += 1;
                current_chunk.clear();
            }

            current_chunk.push_str(segment);
            last_pos = next_split;
        }

        // Emit whatever is left, unless it is only whitespace.
        let remainder = current_chunk.trim();
        if !remainder.is_empty() {
            chunks.push(Chunk {
                document_id: document.id.clone(),
                index,
                text: remainder.to_string(),
            });
        }

        Ok(chunks)
    }
}

/// Tags after which a segment boundary is placed.
///
/// Every entry must start with `<`, because `next_split_after` only looks for
/// tags at positions where a `<` occurs.
///
/// NOTE(review): `<p>` is the only opening tag in this list; the others are
/// closing tags or `<br>`. Confirm whether `</p>` was intended.
const SPLIT_TAGS: [&str; 6] = ["<p>", "</div>", "<br>", "</h1>", "</h2>", "</h3>"];

/// Returns the byte offset just past the first split tag that starts at or
/// after `from`, or `content.len()` when no split tag remains.
///
/// The content is scanned once, left to right, instead of searching the whole
/// remainder separately for every tag.
fn next_split_after(content: &str, from: usize) -> usize {
    let mut cursor = from;
    while let Some(offset) = content[cursor..].find('<') {
        let tag_start = cursor + offset;
        let rest = &content[tag_start..];
        if let Some(tag) = SPLIT_TAGS.iter().find(|tag| rest.starts_with(**tag)) {
            return tag_start + tag.len();
        }
        // `<` is a single byte in UTF-8, so stepping over it keeps the cursor
        // on a character boundary.
        cursor = tag_start + 1;
    }
    content.len()
}