use crate::core::Chunk;
use crate::error::Result;
pub trait Chunker: Send + Sync {
fn chunk(
&self,
buffer_id: i64,
text: &str,
metadata: Option<&ChunkMetadata>,
) -> Result<Vec<Chunk>>;
fn name(&self) -> &'static str;
fn supports_parallel(&self) -> bool {
false
}
fn description(&self) -> &'static str {
"No description available"
}
fn validate(&self, metadata: Option<&ChunkMetadata>) -> Result<()> {
if let Some(meta) = metadata {
if meta.chunk_size == 0 {
return Err(crate::error::ChunkingError::InvalidConfig {
reason: "chunk_size must be > 0".to_string(),
}
.into());
}
if meta.overlap >= meta.chunk_size {
return Err(crate::error::ChunkingError::OverlapTooLarge {
overlap: meta.overlap,
size: meta.chunk_size,
}
.into());
}
}
Ok(())
}
}
/// Options handed to a `Chunker` alongside the text to split.
///
/// Note that the derived `Default` zeroes every field, so
/// `ChunkMetadata::default()` has `chunk_size == 0`, which the default
/// `Chunker::validate` implementation rejects. `ChunkMetadata::new()` fills
/// in the module-level default size and overlap instead.
#[derive(Debug, Clone, Default)]
pub struct ChunkMetadata {
/// Optional label for where the text came from.
/// NOTE(review): presumably a file name or path (tests use "test.txt") — confirm.
pub source: Option<String>,
/// Optional content-type tag.
/// NOTE(review): exact vocabulary is not visible here (tests use "txt") — confirm.
pub content_type: Option<String>,
/// Requested chunk size; must be non-zero to pass the default validation.
/// NOTE(review): unit (bytes vs. characters) is not visible here — confirm.
pub chunk_size: usize,
/// Requested overlap between chunks; must be strictly smaller than
/// `chunk_size` to pass the default validation.
pub overlap: usize,
/// Line-preservation flag; `new()` turns it on.
/// NOTE(review): presumably asks chunkers not to split inside a line — confirm.
pub preserve_lines: bool,
/// Sentence-preservation flag; `new()` leaves it off.
/// NOTE(review): presumably asks chunkers not to split inside a sentence — confirm.
pub preserve_sentences: bool,
/// Chunk-count limit; both `Default` and `new()` leave it at 0.
/// NOTE(review): whether 0 means "no limit" is not visible here — confirm.
pub max_chunks: usize,
}
impl ChunkMetadata {
    /// Builds metadata using the parent module's `DEFAULT_CHUNK_SIZE` and
    /// `DEFAULT_OVERLAP`, with line preservation on.
    ///
    /// Every other field starts out empty / zero / `false`. This differs
    /// from the derived `Default`, which leaves `chunk_size` and `overlap`
    /// at zero and `preserve_lines` off.
    #[must_use]
    pub fn new() -> Self {
        Self {
            source: None,
            content_type: None,
            chunk_size: super::DEFAULT_CHUNK_SIZE,
            overlap: super::DEFAULT_OVERLAP,
            preserve_lines: true,
            preserve_sentences: false,
            max_chunks: 0,
        }
    }

    /// Like [`ChunkMetadata::new`], but with the given `chunk_size` and an
    /// overlap of zero.
    #[must_use]
    pub fn with_size(chunk_size: usize) -> Self {
        Self::with_size_and_overlap(chunk_size, 0)
    }

    /// Like [`ChunkMetadata::new`], but with explicit `chunk_size` and
    /// `overlap`.
    ///
    /// The values are stored as given; no validation happens here.
    #[must_use]
    pub fn with_size_and_overlap(chunk_size: usize, overlap: usize) -> Self {
        let mut meta = Self::new();
        meta.chunk_size = chunk_size;
        meta.overlap = overlap;
        meta
    }

    /// Builder-style setter: stores an owned copy of `source`.
    #[must_use]
    pub fn source(self, source: &str) -> Self {
        Self {
            source: Some(source.to_owned()),
            ..self
        }
    }

    /// Builder-style setter: stores an owned copy of `content_type`.
    #[must_use]
    pub fn content_type(self, content_type: &str) -> Self {
        Self {
            content_type: Some(content_type.to_owned()),
            ..self
        }
    }

    /// Builder-style setter for the `preserve_lines` flag.
    #[must_use]
    pub const fn preserve_lines(mut self, preserve: bool) -> Self {
        self.preserve_lines = preserve;
        self
    }

    /// Builder-style setter for the `preserve_sentences` flag.
    #[must_use]
    pub const fn preserve_sentences(mut self, preserve: bool) -> Self {
        self.preserve_sentences = preserve;
        self
    }

    /// Builder-style setter for the `max_chunks` value.
    #[must_use]
    pub const fn max_chunks(mut self, max: usize) -> Self {
        self.max_chunks = max;
        self
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Implements only the required trait methods, so calls to the optional
    /// ones land on the trait's default bodies.
    struct MinimalChunker;

    impl Chunker for MinimalChunker {
        fn chunk(
            &self,
            _buffer_id: i64,
            _text: &str,
            _metadata: Option<&ChunkMetadata>,
        ) -> crate::error::Result<Vec<crate::core::Chunk>> {
            Ok(Vec::new())
        }

        fn name(&self) -> &'static str {
            "minimal"
        }
    }

    #[test]
    fn test_chunk_metadata_defaults() {
        let ChunkMetadata {
            chunk_size,
            overlap,
            preserve_lines,
            preserve_sentences,
            ..
        } = ChunkMetadata::new();

        assert_eq!(chunk_size, super::super::DEFAULT_CHUNK_SIZE);
        assert_eq!(overlap, super::super::DEFAULT_OVERLAP);
        assert!(preserve_lines);
        assert!(!preserve_sentences);
    }

    #[test]
    fn test_chunk_metadata_builder() {
        let built = ChunkMetadata::with_size_and_overlap(1000, 100)
            .source("test.txt")
            .content_type("txt")
            .preserve_sentences(true)
            .max_chunks(10);

        assert_eq!(built.chunk_size, 1000);
        assert_eq!(built.overlap, 100);
        assert_eq!(built.source.as_deref(), Some("test.txt"));
        assert_eq!(built.content_type.as_deref(), Some("txt"));
        assert!(built.preserve_sentences);
        assert_eq!(built.max_chunks, 10);
    }

    #[test]
    fn test_chunk_metadata_with_size() {
        let sized = ChunkMetadata::with_size(500);
        assert_eq!(sized.chunk_size, 500);
        assert_eq!(sized.overlap, 0);
    }

    #[test]
    fn test_chunk_metadata_preserve_lines() {
        // Same order as before: switch the flag off first, then on.
        for &flag in &[false, true] {
            let meta = ChunkMetadata::new().preserve_lines(flag);
            assert_eq!(meta.preserve_lines, flag);
        }
    }

    #[test]
    fn test_chunker_default_description() {
        assert_eq!(MinimalChunker.description(), "No description available");
    }

    #[test]
    fn test_chunker_default_supports_parallel() {
        assert!(!MinimalChunker.supports_parallel());
    }

    /// Checks of the trait methods as seen through `FixedChunker`.
    mod validation_tests {
        use crate::chunking::FixedChunker;
        use crate::chunking::traits::{ChunkMetadata, Chunker};

        /// Runs `validate` on a `FixedChunker` with metadata carrying the
        /// given sizing fields (all other fields defaulted) and reports
        /// whether it was accepted.
        fn accepts(chunk_size: usize, overlap: usize) -> bool {
            let meta = ChunkMetadata {
                chunk_size,
                overlap,
                ..Default::default()
            };
            FixedChunker::with_size(100).validate(Some(&meta)).is_ok()
        }

        #[test]
        fn test_chunker_validate_zero_chunk_size() {
            assert!(!accepts(0, 0));
        }

        #[test]
        fn test_chunker_validate_overlap_too_large() {
            assert!(!accepts(50, 100));
        }

        #[test]
        fn test_chunker_validate_valid() {
            assert!(accepts(100, 10));
        }

        #[test]
        fn test_chunker_validate_none() {
            let outcome = FixedChunker::with_size(100).validate(None);
            assert!(outcome.is_ok());
        }

        #[test]
        fn test_chunker_supports_parallel() {
            let fixed = FixedChunker::with_size(100);
            assert!(!fixed.supports_parallel());
        }

        #[test]
        fn test_chunker_description() {
            let fixed = FixedChunker::with_size(100);
            assert!(!fixed.description().is_empty());
        }

        #[test]
        fn test_chunker_name() {
            let fixed = FixedChunker::with_size(100);
            assert_eq!(fixed.name(), "fixed");
        }
    }
}