use crate::chunking::config::{ChunkerType, ChunkingConfig};
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
use crate::types::Metadata;
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
use async_trait::async_trait;
#[derive(Debug, Clone, Copy)]
pub struct ChunkingProcessor;
impl Plugin for ChunkingProcessor {
fn name(&self) -> &str {
"text-chunking"
}
fn version(&self) -> String {
env!("CARGO_PKG_VERSION").to_string()
}
fn initialize(&self) -> Result<()> {
Ok(())
}
fn shutdown(&self) -> Result<()> {
Ok(())
}
}
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl PostProcessor for ChunkingProcessor {
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
let chunking_config = match &config.chunking {
Some(cfg) => cfg,
None => return Ok(()),
};
let inferred = maybe_infer_yaml_chunker(chunking_config, &result.metadata);
let effective_config = inferred.as_ref().unwrap_or(chunking_config);
let chunking_result = crate::chunking::chunk_text(&result.content, effective_config, None)
.map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
result.chunks = Some(chunking_result.chunks);
Ok(())
}
fn processing_stage(&self) -> ProcessingStage {
ProcessingStage::Middle
}
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
config.chunking.is_some()
}
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
let text_length = result.content.len();
(text_length / 10240).max(1) as u64
}
}
fn maybe_infer_yaml_chunker(config: &ChunkingConfig, metadata: &Metadata) -> Option<ChunkingConfig> {
if config.chunker_type != ChunkerType::Text {
return None;
}
let is_structured = metadata
.additional
.get("data_format")
.and_then(|v| v.as_str())
.is_some_and(|fmt| fmt == "yaml" || fmt == "json");
is_structured.then(|| ChunkingConfig {
chunker_type: ChunkerType::Yaml,
..config.clone()
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::core::config::ChunkingConfig;
use crate::types::Metadata;
use std::borrow::Cow;
#[tokio::test]
async fn test_chunking_processor() {
let processor = ChunkingProcessor;
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 100,
overlap: 10,
trim: true,
chunker_type: crate::chunking::ChunkerType::Text,
..Default::default()
}),
..Default::default()
};
let mut result = ExtractionResult {
content: "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.".to_string(),
mime_type: Cow::Borrowed("text/plain"),
..Default::default()
};
processor.process(&mut result, &config).await.unwrap();
assert!(result.chunks.is_some());
let chunks = result.chunks.unwrap();
assert!(!chunks.is_empty());
}
#[tokio::test]
async fn test_chunking_processor_no_config() {
let processor = ChunkingProcessor;
let config = ExtractionConfig::default();
let mut result = ExtractionResult {
content: "Some text".to_string(),
mime_type: Cow::Borrowed("text/plain"),
..Default::default()
};
processor.process(&mut result, &config).await.unwrap();
assert!(result.chunks.is_none());
}
#[test]
fn test_chunking_processor_plugin_interface() {
let processor = ChunkingProcessor;
assert_eq!(processor.name(), "text-chunking");
assert!(!processor.version().is_empty());
assert!(processor.initialize().is_ok());
assert!(processor.shutdown().is_ok());
}
#[test]
fn test_chunking_processor_stage() {
let processor = ChunkingProcessor;
assert_eq!(processor.processing_stage(), ProcessingStage::Middle);
}
#[test]
fn test_chunking_processor_should_process() {
let processor = ChunkingProcessor;
let result = ExtractionResult {
content: "Sample text".to_string(),
mime_type: Cow::Borrowed("text/plain"),
..Default::default()
};
let config_with_chunking = ExtractionConfig {
chunking: Some(crate::core::config::ChunkingConfig {
max_characters: 100,
overlap: 10,
trim: true,
chunker_type: crate::chunking::ChunkerType::Text,
..Default::default()
}),
..Default::default()
};
assert!(processor.should_process(&result, &config_with_chunking));
let config_without_chunking = ExtractionConfig::default();
assert!(!processor.should_process(&result, &config_without_chunking));
}
#[test]
fn test_chunking_processor_estimated_duration() {
let processor = ChunkingProcessor;
let short_result = ExtractionResult {
content: "Short".to_string(),
mime_type: Cow::Borrowed("text/plain"),
..Default::default()
};
let long_result = ExtractionResult {
content: "a".repeat(100000),
mime_type: Cow::Borrowed("text/plain"),
..Default::default()
};
let short_duration = processor.estimated_duration_ms(&short_result);
let long_duration = processor.estimated_duration_ms(&long_result);
assert!(long_duration > short_duration);
}
fn make_metadata_with_format(format: &str) -> Metadata {
let mut metadata = Metadata::default();
metadata
.additional
.insert(Cow::Borrowed("data_format"), serde_json::json!(format));
metadata
}
#[tokio::test]
async fn test_auto_infer_yaml_from_metadata() {
let processor = ChunkingProcessor;
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 10000,
overlap: 0,
trim: true,
chunker_type: crate::chunking::ChunkerType::Text,
..Default::default()
}),
..Default::default()
};
let yaml_content = "server:\n host: localhost\n port: 8080";
let mut result = ExtractionResult {
content: yaml_content.to_string(),
mime_type: Cow::Borrowed("text/yaml"),
metadata: make_metadata_with_format("yaml"),
..Default::default()
};
processor.process(&mut result, &config).await.unwrap();
let chunks = result.chunks.unwrap();
assert!(chunks[0].content.contains("# server > host"));
}
#[tokio::test]
async fn test_auto_infer_json_from_metadata() {
let processor = ChunkingProcessor;
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 10000,
overlap: 0,
trim: true,
chunker_type: crate::chunking::ChunkerType::Text,
..Default::default()
}),
..Default::default()
};
let json_content = r#"{"name": "test", "version": "1.0"}"#;
let mut result = ExtractionResult {
content: json_content.to_string(),
mime_type: Cow::Borrowed("application/json"),
metadata: make_metadata_with_format("json"),
..Default::default()
};
processor.process(&mut result, &config).await.unwrap();
let chunks = result.chunks.unwrap();
assert!(chunks[0].content.contains("# name"));
}
#[tokio::test]
async fn test_explicit_type_not_overridden() {
let processor = ChunkingProcessor;
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 10000,
overlap: 0,
trim: true,
chunker_type: crate::chunking::ChunkerType::Markdown,
..Default::default()
}),
..Default::default()
};
let yaml_content = "server:\n host: localhost\n port: 8080";
let mut result = ExtractionResult {
content: yaml_content.to_string(),
mime_type: Cow::Borrowed("text/yaml"),
metadata: make_metadata_with_format("yaml"),
..Default::default()
};
processor.process(&mut result, &config).await.unwrap();
let chunks = result.chunks.unwrap();
assert!(!chunks[0].content.contains("# server > host"));
}
#[tokio::test]
async fn test_missing_data_format_no_inference() {
let processor = ChunkingProcessor;
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 10000,
overlap: 0,
trim: true,
chunker_type: crate::chunking::ChunkerType::Text,
..Default::default()
}),
..Default::default()
};
let yaml_content = "server:\n host: localhost\n port: 8080";
let mut result = ExtractionResult {
content: yaml_content.to_string(),
mime_type: Cow::Borrowed("text/yaml"),
..Default::default()
};
processor.process(&mut result, &config).await.unwrap();
let chunks = result.chunks.unwrap();
assert!(!chunks[0].content.contains("# server > host"));
}
}