use crate::error::{KreuzbergError, Result};
use crate::types::{Chunk, ChunkMetadata, PageBoundary};
use text_splitter::{Characters, ChunkCapacity, ChunkConfig};
use super::boundaries::calculate_page_range;
/// Assembles a `text_splitter` configuration from the caller-supplied limits.
///
/// `max_characters` caps each chunk's size, `overlap` is the number of
/// characters shared between consecutive chunks, and `trim` controls
/// whitespace trimming at chunk edges.
///
/// # Errors
///
/// Returns a validation error when the combination is rejected by
/// `text_splitter` (e.g. an overlap that is not smaller than the capacity).
pub fn build_chunk_config(max_characters: usize, overlap: usize, trim: bool) -> Result<ChunkConfig<Characters>> {
    let capacity = ChunkCapacity::new(max_characters);
    let config = ChunkConfig::new(capacity)
        .with_overlap(overlap)
        .map_err(|e| KreuzbergError::validation(format!("Invalid chunking configuration: {}", e)))?;
    Ok(config.with_trim(trim))
}
pub fn build_chunks<'a, I>(
source_text: &'a str,
text_chunks: I,
page_boundaries: Option<&[PageBoundary]>,
) -> Result<Vec<Chunk>>
where
I: IntoIterator<Item = &'a str>,
{
let chunks_vec: Vec<&str> = text_chunks.into_iter().collect();
let total_chunks = chunks_vec.len();
let source_start = source_text.as_ptr() as usize;
let mut chunks = Vec::with_capacity(total_chunks);
for (index, chunk_text) in chunks_vec.into_iter().enumerate() {
let byte_start = chunk_text.as_ptr() as usize - source_start;
let byte_end = byte_start + chunk_text.len();
let (first_page, last_page) = if let Some(boundaries) = page_boundaries {
calculate_page_range(byte_start, byte_end, boundaries)?
} else {
(None, None)
};
chunks.push(Chunk {
content: chunk_text.to_string(),
embedding: None,
metadata: ChunkMetadata {
byte_start,
byte_end,
token_count: None,
chunk_index: index,
total_chunks,
first_page,
last_page,
heading_context: None,
},
});
}
Ok(chunks)
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_build_chunk_config_valid() {
        // Overlap (10) below capacity (100) is an acceptable configuration.
        assert!(build_chunk_config(100, 10, true).is_ok());
    }

    #[test]
    fn test_build_chunk_config_invalid_overlap() {
        // Overlap exceeding the capacity must surface as a validation error.
        let outcome = build_chunk_config(10, 20, true);
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), KreuzbergError::Validation { .. }));
    }

    #[test]
    fn test_build_chunks_empty() {
        // No input chunks -> no output chunks.
        let produced = build_chunks("", Vec::<&str>::new(), None).unwrap();
        assert!(produced.is_empty());
    }

    #[test]
    fn test_build_chunks_single() {
        let text = "Single chunk";
        let produced = build_chunks(text, vec![text], None).unwrap();
        assert_eq!(produced.len(), 1);
        let only = &produced[0];
        assert_eq!(only.content, "Single chunk");
        assert_eq!(only.metadata.chunk_index, 0);
        assert_eq!(only.metadata.total_chunks, 1);
        assert_eq!((only.metadata.byte_start, only.metadata.byte_end), (0, 12));
    }

    #[test]
    fn test_build_chunks_with_page_boundaries() {
        let text = "First chunkSecond chunk";
        let pages = [
            PageBoundary {
                byte_start: 0,
                byte_end: 11,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 11,
                byte_end: 23,
                page_number: 2,
            },
        ];
        let produced =
            build_chunks(text, vec![&text[0..11], &text[11..23]], Some(&pages[..])).unwrap();
        assert_eq!(produced.len(), 2);
        // Each chunk maps onto exactly one page boundary.
        assert_eq!(produced[0].metadata.first_page, Some(1));
        assert_eq!(produced[1].metadata.first_page, Some(2));
    }

    #[test]
    fn test_build_chunks_offset_from_source() {
        // Overlapping slices keep their true positions within the source.
        let text = "AAAAABBBBBCCCCC";
        let pieces = vec![&text[0..5], &text[3..8], &text[6..11]];
        let produced = build_chunks(text, pieces, None).unwrap();
        let spans: Vec<_> = produced
            .iter()
            .map(|c| (c.metadata.byte_start, c.metadata.byte_end))
            .collect();
        assert_eq!(spans, vec![(0, 5), (3, 8), (6, 11)]);
    }

    #[test]
    fn test_build_chunks_no_overlap() {
        // Contiguous, non-overlapping slices tile the source exactly.
        let text = "AAAAABBBBBCCCCC";
        let pieces = vec![&text[0..5], &text[5..10], &text[10..15]];
        let produced = build_chunks(text, pieces, None).unwrap();
        let spans: Vec<_> = produced
            .iter()
            .map(|c| (c.metadata.byte_start, c.metadata.byte_end))
            .collect();
        assert_eq!(spans, vec![(0, 5), (5, 10), (10, 15)]);
    }
}