cerebro 0.1.4

Blazing-fast, storage-agnostic semantic memory engine for AI Agents — written in pure Rust
pub mod semantic;

use crate::models::{Chunk, Document};
use crate::traits::{Chunker, Result};

/// Text chunker that cuts a document's content into fixed-size windows.
///
/// Sizes are measured in bytes of UTF-8 text, not in characters; chunk
/// edges are moved back to the nearest character boundary so that a
/// multi-byte character is never split.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RecursiveCharacterChunker {
    /// Target upper bound for the length of one chunk, in bytes.
    pub chunk_size: usize,
    /// How many bytes the start of the next chunk is moved back from the
    /// end of the previous chunk, so neighbouring chunks share text.
    pub chunk_overlap: usize,
}

impl RecursiveCharacterChunker {
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        Self { chunk_size, chunk_overlap }
    }
}

impl Chunker for RecursiveCharacterChunker {
    /// Splits `document.content` into consecutive, optionally overlapping chunks.
    ///
    /// Each chunk covers at most `chunk_size` bytes, with its end pulled back
    /// to a UTF-8 character boundary. The next chunk starts `chunk_overlap`
    /// bytes before the end of the previous one (again aligned to a character
    /// boundary). Chunks carry the document's id and a zero-based index.
    ///
    /// Degenerate configurations are handled without panicking or looping:
    /// * If the next character is wider than `chunk_size` (including
    ///   `chunk_size == 0`), that single character is emitted whole.
    /// * If applying the overlap would not move the window forward (for
    ///   example `chunk_overlap >= chunk_size`), the overlap is dropped for
    ///   that step and the next chunk starts where the previous one ended.
    ///
    /// An empty document yields an empty vector. This implementation never
    /// returns `Err`.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        let content: &str = &document.content;
        let total = content.len();
        let mut chunks = Vec::new();
        let mut start = 0;

        // Invariant: `start` is always a char boundary and strictly increases.
        while start < total {
            // Tentative window end; saturating so a huge chunk_size cannot overflow.
            let mut end = start.saturating_add(self.chunk_size).min(total);
            while end > start && !content.is_char_boundary(end) {
                end -= 1;
            }
            if end == start {
                // The window is narrower than the next character: take that
                // character whole instead of slicing through it.
                end = start + content[start..].chars().next().map_or(1, char::len_utf8);
            }

            let index = chunks.len();
            chunks.push(Chunk {
                document_id: document.id.clone(),
                index,
                text: content[start..end].to_string(),
            });

            if end == total {
                break;
            }

            // Step back by the overlap, then align to a char boundary without
            // going below the current start.
            let mut next = end.saturating_sub(self.chunk_overlap);
            while next > start && !content.is_char_boundary(next) {
                next -= 1;
            }
            // Guarantee forward progress; otherwise the loop would never end.
            start = if next > start { next } else { end };
        }

        Ok(chunks)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the chunker over `content` and returns only the chunk texts.
    fn chunk_texts(chunk_size: usize, chunk_overlap: usize, content: &str) -> Vec<String> {
        let chunker = RecursiveCharacterChunker::new(chunk_size, chunk_overlap);
        let doc = Document::new(content);
        chunker
            .chunk(&doc)
            .unwrap()
            .into_iter()
            .map(|chunk| chunk.text)
            .collect()
    }

    #[test]
    fn test_recursive_chunker() {
        let texts = chunk_texts(10, 2, "hello world this is a test");
        // Each window starts two bytes before the previous window's end.
        assert_eq!(texts, vec!["hello worl", "rld this i", " is a test"]);
    }

    #[test]
    fn test_empty_document_produces_no_chunks() {
        let chunker = RecursiveCharacterChunker::new(100, 10);
        let doc = Document::new("");
        let chunks = chunker.chunk(&doc).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_content_smaller_than_chunk_size() {
        let texts = chunk_texts(1000, 100, "short");
        assert_eq!(texts, vec!["short"]);
    }

    #[test]
    fn test_exact_chunk_size() {
        let texts = chunk_texts(5, 0, "12345");
        assert_eq!(texts, vec!["12345"]);
    }

    #[test]
    fn test_chunk_indices_are_sequential() {
        let chunker = RecursiveCharacterChunker::new(5, 0);
        let doc = Document::new("abcdefghijklmnopqrstuvwxyz");
        let chunks = chunker.chunk(&doc).unwrap();
        // 26 bytes in windows of 5 without overlap: five full chunks plus "z".
        assert_eq!(chunks.len(), 6);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
        assert_eq!(chunks[5].text, "z");
    }

    #[test]
    fn test_chunk_document_id_matches() {
        let chunker = RecursiveCharacterChunker::new(10, 0);
        let doc = Document::new("some text that needs chunking");
        let doc_id = doc.id.clone();
        let chunks = chunker.chunk(&doc).unwrap();
        // Guard against a vacuous pass when no chunks are produced.
        assert_eq!(chunks.len(), 3);
        for chunk in &chunks {
            assert_eq!(chunk.document_id, doc_id);
        }
    }

    #[test]
    fn test_overlap_creates_overlapping_content() {
        let texts = chunk_texts(10, 3, "0123456789ABCDEF");
        assert_eq!(texts.len(), 2);
        assert_eq!(texts[0], "0123456789");
        // The second chunk must repeat the last three bytes of the first.
        assert_eq!(texts[1], "789ABCDEF");
        assert!(texts[0].ends_with("789") && texts[1].starts_with("789"));
    }

    #[test]
    fn test_unicode_content_does_not_panic() {
        let chunker = RecursiveCharacterChunker::new(5, 1);
        let doc = Document::new("héllo wörld 🦀 rust");
        let result = chunker.chunk(&doc);
        assert!(result.is_ok());
        let chunks = result.unwrap();
        assert!(!chunks.is_empty());
        // The first window is cut at a character boundary; the tail is preserved.
        assert_eq!(chunks[0].text, "héll");
        assert!(chunks.last().unwrap().text.ends_with("rust"));
        // Multi-byte characters survive intact in some chunk.
        assert!(chunks.iter().any(|chunk| chunk.text.contains('🦀')));
        assert!(chunks.iter().any(|chunk| chunk.text.contains('ö')));
    }
}