use crate::{
data::{AnnotatedDocument, Document, Extraction, CharInterval},
exceptions::LangExtractResult,
tokenizer::{TokenInterval, TokenizedText, Tokenizer, SentenceIterator},
};
use regex::Regex;
use semchunk_rs::Chunker;
use std::sync::Arc;
use once_cell::sync::Lazy;
/// Strategy `TextChunker::chunk_text` dispatches on when the input does not fit in one chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkingStrategy {
/// Fixed-length windows with the configured overlap (handled by `chunk_fixed_size`).
#[deprecated(note = "Use Semantic chunking for better results")]
FixedSize,
/// Splits at boundaries found by the chunker's sentence regex (handled by `chunk_by_sentences`).
#[deprecated(note = "Use Semantic chunking for better results")]
Sentence,
/// Splits at boundaries found by the chunker's paragraph regex (handled by `chunk_by_paragraphs`).
#[deprecated(note = "Use Semantic chunking for better results")]
Paragraph,
/// Currently takes the same code path as `Semantic` (`chunk_adaptive` calls `chunk_semantic`).
Adaptive,
/// Token-budgeted chunking through `semchunk_rs::Chunker` (handled by `chunk_semantic`).
Semantic,
}
/// A piece of a larger text together with its position in that text.
#[derive(Debug, Clone)]
pub struct TextChunk {
/// Sequential index of the chunk within its source text.
pub id: usize,
/// The chunk's text, including any overlap shared with neighbouring chunks.
pub text: String,
/// Offset of the chunk's first byte in the source text.
pub char_offset: usize,
/// Length of `text`. The constructors set this to `text.len()`, i.e. a byte count.
pub char_length: usize,
/// Identifier of the originating document, if one was supplied.
pub document_id: Option<String>,
/// `true` when either overlap amount passed to `with_overlap` is non-zero.
pub has_overlap: bool,
/// `(overlap_start, overlap_end)`: amounts trimmed from the front and back by `core_text`.
pub overlap_info: Option<(usize, usize)>,
}
impl TextChunk {
    /// Creates a chunk without overlap information.
    ///
    /// `char_length` is taken from `text.len()`, so it is measured in bytes.
    pub fn new(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: false,
            overlap_info: None,
        }
    }

    /// Creates a chunk that shares `overlap_start` leading and `overlap_end` trailing bytes
    /// with its neighbours.
    ///
    /// `has_overlap` is set when either amount is non-zero; `overlap_info` is always recorded.
    pub fn with_overlap(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
        overlap_start: usize,
        overlap_end: usize,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: overlap_start > 0 || overlap_end > 0,
            overlap_info: Some((overlap_start, overlap_end)),
        }
    }

    /// Returns the span `[char_offset, char_offset + char_length)` this chunk occupies.
    pub fn char_interval(&self) -> CharInterval {
        CharInterval::new(
            Some(self.char_offset),
            Some(self.char_offset + self.char_length),
        )
    }

    /// Returns the chunk text with the recorded overlap stripped from both ends.
    ///
    /// Never panics: overlap amounts larger than the text are clamped (yielding an empty
    /// string when nothing remains), and cut points that would fall inside a multi-byte
    /// character are moved inwards to the nearest character boundary.
    pub fn core_text(&self) -> &str {
        let (start_overlap, end_overlap) = match self.overlap_info {
            Some(overlaps) => overlaps,
            None => return &self.text,
        };

        let len = self.text.len();

        // Move the end cut backwards onto a boundary; index 0 is always a boundary.
        let mut end = len.saturating_sub(end_overlap);
        while !self.text.is_char_boundary(end) {
            end -= 1;
        }

        // Move the start cut forwards onto a boundary; `len` is always a boundary.
        let mut start = start_overlap.min(len);
        while !self.text.is_char_boundary(start) {
            start += 1;
        }

        if start >= end {
            ""
        } else {
            &self.text[start..end]
        }
    }
}
/// A chunk expressed as a token interval over a document.
#[derive(Debug, Clone)]
pub struct TokenChunk {
/// Token range covered by this chunk.
pub token_interval: TokenInterval,
/// Document the interval refers to; needed whenever text or offsets are not precomputed.
pub document: Option<Arc<Document>>,
// Precomputed chunk text; when present, `chunk_text` returns it without tokenizing.
chunk_text: Option<String>,
// Precomputed character span; when present, `char_interval` returns it without tokenizing.
char_interval: Option<CharInterval>,
// Optional end offset that `chunk_text` uses instead of the last token's end.
custom_char_end: Option<usize>,
}
impl TokenChunk {
pub fn new(token_interval: TokenInterval, document: Option<Arc<Document>>) -> Self {
Self {
token_interval,
document,
chunk_text: None,
char_interval: None,
custom_char_end: None,
}
}
pub fn with_char_end(token_interval: TokenInterval, document: Option<Arc<Document>>, char_end: usize) -> Self {
Self {
token_interval,
document,
chunk_text: None,
char_interval: None,
custom_char_end: Some(char_end),
}
}
pub fn with_precomputed(
token_interval: TokenInterval,
document: Option<Arc<Document>>,
chunk_text: String,
char_interval: CharInterval,
) -> Self {
Self {
token_interval,
document,
chunk_text: Some(chunk_text),
char_interval: Some(char_interval),
custom_char_end: None,
}
}
pub fn with_precomputed_and_char_end(
token_interval: TokenInterval,
document: Option<Arc<Document>>,
chunk_text: String,
char_interval: CharInterval,
_char_end: usize,
) -> Self {
Self {
token_interval,
document,
chunk_text: Some(chunk_text),
char_interval: Some(char_interval),
custom_char_end: None, }
}
pub fn document_id(&self) -> Option<&str> {
self.document.as_ref()?.document_id.as_deref()
}
pub fn document_text(&self) -> Option<&TokenizedText> {
None
}
pub fn chunk_text(&self, tokenizer: &Tokenizer) -> LangExtractResult<String> {
if let Some(ref cached) = self.chunk_text {
return Ok(cached.clone());
}
if let Some(ref document) = self.document {
let tokenized = tokenizer.tokenize(&document.text)?;
if let Some(custom_end) = self.custom_char_end {
if !tokenized.tokens.is_empty() && self.token_interval.start_index < tokenized.tokens.len() {
let start_token = &tokenized.tokens[self.token_interval.start_index];
let start_char = start_token.char_interval.start_pos;
let end_char = std::cmp::min(custom_end, document.text.len());
return Ok(document.text[start_char..end_char].to_string());
}
}
let text = tokenizer.tokens_text(&tokenized, &self.token_interval)?;
Ok(text)
} else {
Err(crate::exceptions::LangExtractError::invalid_input(
"Document text must be set to access chunk text"
))
}
}
pub fn sanitized_chunk_text(&self, tokenizer: &Tokenizer) -> LangExtractResult<String> {
let text = self.chunk_text(tokenizer)?;
Ok(sanitize_text(&text)?)
}
pub fn additional_context(&self) -> Option<&str> {
self.document.as_ref()?.additional_context.as_deref()
}
pub fn char_interval(&self, tokenizer: &Tokenizer) -> LangExtractResult<CharInterval> {
if let Some(ref cached) = self.char_interval {
return Ok(cached.clone());
}
if let Some(ref document) = self.document {
let tokenized = tokenizer.tokenize(&document.text)?;
let tokens = &tokenized.tokens;
if self.token_interval.start_index >= tokens.len()
|| self.token_interval.end_index > tokens.len() {
return Err(crate::exceptions::LangExtractError::invalid_input(
"Token interval is out of bounds for the document"
));
}
let start_token = &tokens[self.token_interval.start_index];
let end_token = &tokens[self.token_interval.end_index - 1];
Ok(CharInterval {
start_pos: Some(start_token.char_interval.start_pos),
end_pos: Some(end_token.char_interval.end_pos),
})
} else {
Err(crate::exceptions::LangExtractError::invalid_input(
"Document text must be set to compute char interval"
))
}
}
}
/// Matches a run of one or more whitespace characters; built lazily on first use.
static WHITESPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
/// Trims `text` and collapses each whitespace run into a single space.
///
/// # Errors
/// Returns an invalid-input error when nothing is left after sanitizing.
fn sanitize_text(text: &str) -> LangExtractResult<String> {
    let collapsed = WHITESPACE_RE.replace_all(text.trim(), " ");
    if collapsed.is_empty() {
        Err(crate::exceptions::LangExtractError::invalid_input("Sanitized text is empty"))
    } else {
        Ok(collapsed.into_owned())
    }
}
/// Settings that control how `TextChunker` splits text.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
/// Upper bound on chunk size. `chunk_text` compares it with the input's byte length, the
/// fixed-size splitter uses it as its window, and `chunk_semantic` passes it to
/// `Chunker::new` together with a token counter.
pub max_chunk_size: usize,
/// Amount shared between consecutive fixed-size chunks.
pub overlap_size: usize,
/// Strategy used for inputs that do not fit in a single chunk.
pub strategy: ChunkingStrategy,
/// Smallest segment `chunk_by_boundaries` emits on its own (the first chunk is exempt).
pub min_chunk_size: usize,
/// NOTE(review): not read by the code visible here — confirm where this is honoured.
pub respect_paragraphs: bool,
/// NOTE(review): not read by the code visible here — confirm where this is honoured.
pub respect_sentences: bool,
/// NOTE(review): not read by the code visible here — confirm where this is honoured.
pub semantic_similarity_threshold: f32,
/// Optional cap on the number of semantic chunks; excess chunks are merged into the last one.
pub semantic_max_chunks: Option<usize>,
}
impl Default for ChunkingConfig {
fn default() -> Self {
Self {
max_chunk_size: 2000,
overlap_size: 200,
strategy: ChunkingStrategy::Adaptive,
min_chunk_size: 100,
respect_paragraphs: true,
respect_sentences: true,
semantic_similarity_threshold: 0.7,
semantic_max_chunks: None,
}
}
}
/// Splits text into `TextChunk`s according to a `ChunkingConfig`.
pub struct TextChunker {
// Active configuration; exposed read-only through `config()`.
config: ChunkingConfig,
// Matches sentence-ending punctuation followed by whitespace.
sentence_regex: Regex,
// Matches a blank line (newline, optional whitespace, newline).
paragraph_regex: Regex,
}
impl TextChunker {
pub fn new() -> Self {
Self::with_config(ChunkingConfig::default())
}
pub fn with_config(config: ChunkingConfig) -> Self {
let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();
let paragraph_regex = Regex::new(r"\n\s*\n").unwrap();
Self {
config,
sentence_regex,
paragraph_regex,
}
}
pub fn chunk_document(&self, document: &Document) -> LangExtractResult<Vec<TextChunk>> {
self.chunk_text(&document.text, document.document_id.clone())
}
#[tracing::instrument(skip_all, fields(text_len = text.len(), strategy = ?self.config.strategy, max_chunk_size = self.config.max_chunk_size))]
pub fn chunk_text(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
if text.len() <= self.config.max_chunk_size {
return Ok(vec![TextChunk::new(0, text.to_string(), 0, document_id)]);
}
#[allow(deprecated)]
match self.config.strategy {
ChunkingStrategy::FixedSize => self.chunk_fixed_size(text, document_id),
ChunkingStrategy::Sentence => self.chunk_by_sentences(text, document_id),
ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text, document_id),
ChunkingStrategy::Adaptive => self.chunk_adaptive(text, document_id),
ChunkingStrategy::Semantic => self.chunk_semantic(text, document_id),
}
}
fn chunk_fixed_size(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
let mut chunks = Vec::new();
let mut chunk_id = 0;
let mut current_pos = 0;
while current_pos < text.len() {
let chunk_end = std::cmp::min(
current_pos + self.config.max_chunk_size,
text.len()
);
let chunk_text = text[current_pos..chunk_end].to_string();
let overlap_start = if chunk_id > 0 { self.config.overlap_size } else { 0 };
let overlap_end = if chunk_end < text.len() { self.config.overlap_size } else { 0 };
let chunk = TextChunk::with_overlap(
chunk_id,
chunk_text,
current_pos,
document_id.clone(),
overlap_start,
overlap_end,
);
chunks.push(chunk);
chunk_id += 1;
let step_size = self.config.max_chunk_size.saturating_sub(self.config.overlap_size);
current_pos += step_size;
}
Ok(chunks)
}
fn chunk_by_sentences(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
let sentence_boundaries = self.find_sentence_boundaries(text);
self.chunk_by_boundaries(text, &sentence_boundaries, document_id)
}
fn chunk_by_paragraphs(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
let paragraph_boundaries = self.find_paragraph_boundaries(text);
self.chunk_by_boundaries(text, ¶graph_boundaries, document_id)
}
fn chunk_adaptive(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
self.chunk_semantic(text, document_id)
}
fn find_sentence_boundaries(&self, text: &str) -> Vec<usize> {
let mut boundaries = vec![0];
for mat in self.sentence_regex.find_iter(text) {
boundaries.push(mat.end());
}
if boundaries.last() != Some(&text.len()) {
boundaries.push(text.len()); }
boundaries
}
fn find_paragraph_boundaries(&self, text: &str) -> Vec<usize> {
let mut boundaries = vec![0];
for mat in self.paragraph_regex.find_iter(text) {
boundaries.push(mat.end());
}
if boundaries.last() != Some(&text.len()) {
boundaries.push(text.len()); }
boundaries
}
#[tracing::instrument(skip_all, fields(text_len = text.len()))]
fn chunk_semantic(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
let bpe = tiktoken_rs::cl100k_base().map_err(|e| {
crate::exceptions::LangExtractError::invalid_input(
&format!("Failed to initialize tiktoken tokenizer: {}", e)
)
})?;
let token_counter = Box::new(move |s: &str| bpe.encode_with_special_tokens(s).len());
let chunker = Chunker::new(self.config.max_chunk_size, token_counter);
let semantic_chunks = chunker.chunk(text);
let mut chunks = Vec::new();
let mut current_pos = 0;
for (chunk_id, chunk_text) in semantic_chunks.into_iter().enumerate() {
let start_pos = if text[current_pos..].starts_with(&chunk_text) {
current_pos
} else if let Some(found_pos) = text[current_pos..].find(&chunk_text) {
log::warn!(
"Semantic chunk {} not contiguous at offset {}, found at {}",
chunk_id, current_pos, current_pos + found_pos
);
current_pos + found_pos
} else {
log::warn!(
"Semantic chunk {} text not found at offset {}, using current position",
chunk_id, current_pos
);
current_pos
};
let end_pos = start_pos + chunk_text.len();
let text_chunk = TextChunk::new(
chunk_id,
chunk_text.clone(),
start_pos,
document_id.clone(),
);
chunks.push(text_chunk);
current_pos = end_pos;
}
if chunks.is_empty() {
return Ok(vec![TextChunk::new(0, text.to_string(), 0, document_id)]);
}
let final_chunks = if let Some(max_chunks) = self.config.semantic_max_chunks {
if chunks.len() > max_chunks {
let mut merged_chunks = chunks[..max_chunks-1].to_vec();
let remaining_chunks = &chunks[max_chunks-1..];
let merged_start = remaining_chunks[0].char_offset;
let last = remaining_chunks.last().unwrap();
let merged_end = last.char_offset + last.char_length;
let merged_text = text[merged_start..merged_end].to_string();
let merged_chunk = TextChunk::new(
max_chunks - 1,
merged_text,
merged_start,
document_id,
);
merged_chunks.push(merged_chunk);
merged_chunks
} else {
chunks
}
} else {
chunks
};
Ok(final_chunks)
}
fn chunk_by_boundaries(
&self,
text: &str,
boundaries: &[usize],
document_id: Option<String>,
) -> LangExtractResult<Vec<TextChunk>> {
let mut chunks = Vec::new();
let mut chunk_id = 0;
let mut current_start = 0;
for &boundary in boundaries.iter().skip(1) {
let potential_chunk_size = boundary - current_start;
if potential_chunk_size <= self.config.max_chunk_size {
if potential_chunk_size >= self.config.min_chunk_size || chunks.is_empty() {
let chunk_text = text[current_start..boundary].to_string();
let chunk = TextChunk::new(chunk_id, chunk_text, current_start, document_id.clone());
chunks.push(chunk);
chunk_id += 1;
current_start = boundary;
}
} else {
let section = &text[current_start..boundary];
let mut section_chunks = self.chunk_fixed_size(section, document_id.clone())?;
for chunk in &mut section_chunks {
chunk.id = chunk_id;
chunk.char_offset += current_start;
chunk_id += 1;
}
chunks.extend(section_chunks);
current_start = boundary;
}
}
if chunks.is_empty() {
chunks.push(TextChunk::new(0, text.to_string(), 0, document_id));
}
Ok(chunks)
}
pub fn config(&self) -> &ChunkingConfig {
&self.config
}
}
impl Default for TextChunker {
fn default() -> Self {
Self::new()
}
}
/// Iterator that yields `TokenChunk`s whose character span stays within a buffer limit.
pub struct ChunkIterator<'a> {
// Tokenized form of the text being chunked.
tokenized_text: &'a TokenizedText,
// Tokenizer handed to every `SentenceIterator` this iterator creates.
tokenizer: &'a Tokenizer,
// Largest character span (end minus start) a chunk may cover before it counts as too big.
max_char_buffer: usize,
// Sentence cursor; rebuilt at a token index whenever a chunk ends mid-stream.
sentence_iter: SentenceIterator<'a>,
// Set when the previous chunk ended inside a sentence.
broken_sentence: bool,
// Shared copy of the source document; used to slice chunk text.
document: Option<Arc<Document>>,
}
impl<'a> ChunkIterator<'a> {
pub fn new(
text: &'a TokenizedText,
tokenizer: &'a Tokenizer,
max_char_buffer: usize,
document: Option<&Document>,
) -> LangExtractResult<Self> {
let sentence_iter = SentenceIterator::new(text, tokenizer, 0)?;
Ok(Self {
tokenized_text: text,
tokenizer,
max_char_buffer,
sentence_iter,
broken_sentence: false,
document: document.map(|d| Arc::new(d.clone())),
})
}
fn tokens_exceed_buffer(&self, token_interval: &TokenInterval) -> LangExtractResult<bool> {
let char_interval = self.get_char_interval_for_tokens(token_interval)?;
match (char_interval.start_pos, char_interval.end_pos) {
(Some(start), Some(end)) => Ok((end - start) > self.max_char_buffer),
_ => Ok(false), }
}
fn get_char_interval_for_tokens(&self, token_interval: &TokenInterval) -> LangExtractResult<CharInterval> {
if token_interval.start_index >= self.tokenized_text.tokens.len()
|| token_interval.end_index > self.tokenized_text.tokens.len() {
return Err(crate::exceptions::LangExtractError::invalid_input(
"Token interval is out of bounds"
));
}
let start_token = &self.tokenized_text.tokens[token_interval.start_index];
let end_token = &self.tokenized_text.tokens[token_interval.end_index - 1];
Ok(CharInterval {
start_pos: Some(start_token.char_interval.start_pos),
end_pos: Some(end_token.char_interval.end_pos),
})
}
fn precompute_chunk(&self, token_interval: &TokenInterval) -> (String, CharInterval) {
let tokens = &self.tokenized_text.tokens;
if token_interval.start_index < tokens.len() && token_interval.end_index <= tokens.len() {
let start_token = &tokens[token_interval.start_index];
let end_token = &tokens[token_interval.end_index - 1];
let start_char = start_token.char_interval.start_pos;
let end_char = end_token.char_interval.end_pos;
let text = if let Some(ref doc) = self.document {
doc.text[start_char..end_char].to_string()
} else {
String::new()
};
let interval = CharInterval {
start_pos: Some(start_char),
end_pos: Some(end_char),
};
(text, interval)
} else {
(String::new(), CharInterval::new(None, None))
}
}
fn make_precomputed_chunk(&self, token_interval: TokenInterval) -> TokenChunk {
let (text, interval) = self.precompute_chunk(&token_interval);
TokenChunk::with_precomputed(token_interval, self.document.clone(), text, interval)
}
}
// Produces chunks greedily: each chunk starts at the next sentence and grows while its
// character span stays within `max_char_buffer`.
impl<'a> Iterator for ChunkIterator<'a> {
type Item = LangExtractResult<TokenChunk>;
// Returns the next chunk, `None` when the sentence iterator is exhausted, or the first
// error raised while building intervals or sentence iterators.
fn next(&mut self) -> Option<Self::Item> {
let sentence = match self.sentence_iter.next() {
Some(Ok(sentence)) => sentence,
Some(Err(e)) => return Some(Err(e)),
None => return None,
};
// Start with a chunk that covers only the sentence's first token.
let curr_chunk = match TokenInterval::new(
sentence.start_index,
sentence.start_index + 1
) {
Ok(interval) => interval,
Err(e) => return Some(Err(e)),
};
// Case 1: that single token is already wider than the buffer. Emit it on its own and
// resume sentence iteration at the following token.
match self.tokens_exceed_buffer(&curr_chunk) {
Ok(true) => {
match SentenceIterator::new(
self.tokenized_text,
self.tokenizer,
sentence.start_index + 1,
) {
Ok(new_iter) => {
self.sentence_iter = new_iter;
// The sentence counts as broken only if it had more tokens after this one.
self.broken_sentence = curr_chunk.end_index < sentence.end_index;
}
Err(e) => return Some(Err(e)),
}
return Some(Ok(self.make_precomputed_chunk(curr_chunk)));
}
Ok(false) => {}, Err(e) => return Some(Err(e)),
}
// Case 2: grow the chunk token by token inside the current sentence, remembering the
// most recent token that starts a new line.
let mut start_of_new_line = None;
let mut curr_chunk = curr_chunk;
for token_index in curr_chunk.start_index..sentence.end_index {
if self.tokenized_text.tokens[token_index].first_token_after_newline {
start_of_new_line = Some(token_index);
}
let test_chunk = match TokenInterval::new(curr_chunk.start_index, token_index + 1) {
Ok(interval) => interval,
Err(e) => return Some(Err(e)),
};
match self.tokens_exceed_buffer(&test_chunk) {
Ok(true) => {
// Adding this token overflows the buffer. Prefer to cut at the last line start
// seen, provided it lies after the chunk's first token.
if let Some(newline_pos) = start_of_new_line {
if newline_pos > curr_chunk.start_index {
curr_chunk = match TokenInterval::new(curr_chunk.start_index, newline_pos) {
Ok(interval) => interval,
Err(e) => return Some(Err(e)),
};
}
}
// Resume at the first token not included in the emitted chunk.
match SentenceIterator::new(
self.tokenized_text,
self.tokenizer,
curr_chunk.end_index,
) {
Ok(new_iter) => {
self.sentence_iter = new_iter;
self.broken_sentence = true;
}
Err(e) => return Some(Err(e)),
}
return Some(Ok(self.make_precomputed_chunk(curr_chunk)));
}
Ok(false) => {
curr_chunk = test_chunk;
}
Err(e) => return Some(Err(e)),
}
}
// Case 3: the whole sentence fit. If it was the remainder of a sentence split earlier,
// emit it alone and clear the flag; otherwise keep appending whole sentences while
// they fit.
if self.broken_sentence {
self.broken_sentence = false;
} else {
// `while let` (rather than `for`) because `self.sentence_iter` is replaced inside the loop.
while let Some(next_sentence_result) = self.sentence_iter.next() {
let next_sentence = match next_sentence_result {
Ok(sentence) => sentence,
Err(e) => return Some(Err(e)),
};
let test_chunk = match TokenInterval::new(curr_chunk.start_index, next_sentence.end_index) {
Ok(interval) => interval,
Err(e) => return Some(Err(e)),
};
match self.tokens_exceed_buffer(&test_chunk) {
Ok(true) => {
// The sentence just pulled does not fit: rebuild the iterator at the end of the
// current chunk so that sentence is produced again on the next call.
match SentenceIterator::new(
self.tokenized_text,
self.tokenizer,
curr_chunk.end_index,
) {
Ok(new_iter) => {
self.sentence_iter = new_iter;
}
Err(e) => return Some(Err(e)),
}
break;
}
Ok(false) => {
curr_chunk = test_chunk;
}
Err(e) => return Some(Err(e)),
}
}
}
Some(Ok(self.make_precomputed_chunk(curr_chunk)))
}
}
/// Combines per-chunk extraction results into one annotated document.
pub struct ResultAggregator {
// Minimum word-set similarity (0.0..=1.0) at which two extraction texts count as duplicates.
similarity_threshold: f32,
// When true, duplicate extractions are removed during aggregation.
merge_overlaps: bool,
}
impl ResultAggregator {
pub fn new() -> Self {
Self {
similarity_threshold: 0.8,
merge_overlaps: true,
}
}
pub fn with_settings(similarity_threshold: f32, merge_overlaps: bool) -> Self {
Self {
similarity_threshold,
merge_overlaps,
}
}
pub fn aggregate_chunk_results(
&self,
chunk_results: Vec<ChunkResult>,
original_text: String,
document_id: Option<String>,
) -> LangExtractResult<AnnotatedDocument> {
let mut all_extractions = Vec::new();
for chunk_result in chunk_results {
if let Some(extractions) = chunk_result.extractions {
all_extractions.extend(extractions);
}
}
let deduplicated_extractions = if self.merge_overlaps {
self.deduplicate_extractions(all_extractions)?
} else {
all_extractions
};
let mut annotated_doc = AnnotatedDocument::with_extractions(deduplicated_extractions, original_text);
annotated_doc.document_id = document_id;
Ok(annotated_doc)
}
fn deduplicate_extractions(&self, extractions: Vec<Extraction>) -> LangExtractResult<Vec<Extraction>> {
let mut unique_extractions = Vec::new();
for extraction in extractions {
let mut is_duplicate = false;
for existing in &unique_extractions {
if self.are_similar_extractions(&extraction, existing) {
is_duplicate = true;
break;
}
}
if !is_duplicate {
unique_extractions.push(extraction);
}
}
Ok(unique_extractions)
}
fn are_similar_extractions(&self, e1: &Extraction, e2: &Extraction) -> bool {
if e1.extraction_class == e2.extraction_class {
let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
return similarity >= self.similarity_threshold;
}
if let (Some(interval1), Some(interval2)) = (&e1.char_interval, &e2.char_interval) {
if interval1.overlaps_with(interval2) {
let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
return similarity >= self.similarity_threshold;
}
}
false
}
fn text_similarity(&self, text1: &str, text2: &str) -> f32 {
if text1 == text2 {
return 1.0;
}
let words1: std::collections::HashSet<&str> = text1.split_whitespace().collect();
let words2: std::collections::HashSet<&str> = text2.split_whitespace().collect();
if words1.is_empty() && words2.is_empty() {
return 1.0;
}
let intersection = words1.intersection(&words2).count();
let union = words1.union(&words2).count();
if union == 0 {
0.0
} else {
intersection as f32 / union as f32
}
}
}
impl Default for ResultAggregator {
fn default() -> Self {
Self::new()
}
}
/// Outcome of processing a single chunk.
#[derive(Debug, Clone)]
pub struct ChunkResult {
/// Id of the chunk this result belongs to.
pub chunk_id: usize,
/// Extractions found in the chunk; `None` for results built with `failure`.
pub extractions: Option<Vec<Extraction>>,
/// Offset of the chunk in the source text.
pub char_offset: usize,
/// Length of the chunk.
pub char_length: usize,
/// `true` for results built with `success`, `false` for those built with `failure`.
pub success: bool,
/// Error description; set only by `failure`.
pub error: Option<String>,
/// Processing time, when recorded through `with_processing_time`.
pub processing_time: Option<std::time::Duration>,
}
impl ChunkResult {
pub fn success(
chunk_id: usize,
extractions: Vec<Extraction>,
char_offset: usize,
char_length: usize,
) -> Self {
Self {
chunk_id,
extractions: Some(extractions),
char_offset,
char_length,
success: true,
error: None,
processing_time: None,
}
}
pub fn failure(
chunk_id: usize,
char_offset: usize,
char_length: usize,
error: String,
) -> Self {
Self {
chunk_id,
extractions: None,
char_offset,
char_length,
success: false,
error: Some(error),
processing_time: None,
}
}
pub fn with_processing_time(mut self, duration: std::time::Duration) -> Self {
self.processing_time = Some(duration);
self
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizer::Tokenizer;
/// Builds the tokenizer shared by the tests, panicking if construction fails.
fn create_tokenizer() -> Tokenizer {
    let tokenizer = Tokenizer::new();
    tokenizer.expect("Failed to create tokenizer")
}
/// Wraps `text` in a fresh `Document`.
fn create_document(text: &str) -> Document {
    let owned_text = String::from(text);
    Document::new(owned_text)
}
/// Fixed-size chunking must split long text and never exceed `max_chunk_size`.
#[test]
#[allow(deprecated)] // deliberately exercises the deprecated `FixedSize` strategy
fn test_fixed_size_chunking() {
    let chunker = TextChunker::with_config(ChunkingConfig {
        max_chunk_size: 20,
        overlap_size: 5,
        strategy: ChunkingStrategy::FixedSize,
        ..Default::default()
    });
    let text = "This is a test document with some text that needs to be chunked into smaller pieces.";
    let chunks = chunker.chunk_text(text, None).unwrap();
    assert!(chunks.len() > 1);
    for chunk in &chunks {
        assert!(chunk.char_length <= 20);
    }
}
/// Sentence chunking must produce at least one chunk for multi-sentence text.
#[test]
#[allow(deprecated)] // deliberately exercises the deprecated `Sentence` strategy
fn test_sentence_chunking() {
    let chunker = TextChunker::with_config(ChunkingConfig {
        max_chunk_size: 50,
        strategy: ChunkingStrategy::Sentence,
        ..Default::default()
    });
    let text = "First sentence. Second sentence! Third sentence? Fourth sentence.";
    let chunks = chunker.chunk_text(text, None).unwrap();
    assert!(!chunks.is_empty());
    for chunk in &chunks {
        println!("Chunk: '{}'", chunk.text);
    }
}
/// Text shorter than the maximum chunk size comes back as one unchanged chunk.
#[test]
fn test_small_text_no_chunking() {
    let text = "Short text.";
    let chunks = TextChunker::new().chunk_text(text, None).unwrap();
    assert_eq!(chunks.len(), 1);
    assert_eq!(chunks[0].text, text);
}
/// The character interval spans from the chunk offset to offset plus length.
#[test]
fn test_chunk_char_interval() {
    let interval = TextChunk::new(0, "test".to_string(), 10, None).char_interval();
    assert_eq!(interval.start_pos, Some(10));
    assert_eq!(interval.end_pos, Some(14));
}
/// Overlap metadata is recorded and `core_text` strips it from both ends.
#[test]
fn test_chunk_with_overlap() {
    let (leading, trailing) = (3, 4);
    let chunk = TextChunk::with_overlap(
        0,
        "overlap test text".to_string(),
        0,
        None,
        leading,
        trailing,
    );
    assert!(chunk.has_overlap);
    assert_eq!(chunk.overlap_info, Some((3, 4)));
    assert_eq!(chunk.core_text(), "rlap test ");
}
/// Two short sentences that fit in the buffer end up in the same chunk.
#[test]
fn test_multi_sentence_chunk() {
    let text = "This is a sentence. This is a longer sentence.";
    let tokenizer = create_tokenizer();
    let document = create_document(text);
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 50, Some(&document))
        .expect("Failed to create chunk iterator");
    let first_chunk = chunk_iter
        .next()
        .expect("Should have a chunk")
        .expect("Chunk creation should succeed");
    let chunk_text = first_chunk
        .chunk_text(&tokenizer)
        .expect("Failed to get chunk text");
    for expected in ["This is a sentence.", "This is a longer sentence."] {
        assert!(chunk_text.contains(expected));
    }
}
/// A sentence longer than the buffer is split into several bounded chunks.
#[test]
fn test_sentence_breaking() {
    let text = "This is a very long sentence that definitely exceeds the buffer.";
    let tokenizer = create_tokenizer();
    let document = create_document(text);
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let chunks = ChunkIterator::new(&tokenized, &tokenizer, 20, Some(&document))
        .expect("Failed to create chunk iterator")
        .collect::<Result<Vec<_>, _>>()
        .expect("Chunk iteration should succeed");
    assert!(chunks.len() > 1, "Should break long sentence into multiple chunks");
    for chunk in &chunks {
        let chunk_text = chunk
            .chunk_text(&tokenizer)
            .expect("Failed to get chunk text");
        assert!(chunk_text.len() <= 25, "Chunk should not vastly exceed buffer: '{}'", chunk_text);
    }
}
/// A token wider than the buffer is still emitted, in a chunk of its own.
#[test]
fn test_oversized_token() {
    let text = "Short antidisestablishmentarianism word.";
    let tokenizer = create_tokenizer();
    let document = create_document(text);
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let chunks = ChunkIterator::new(&tokenized, &tokenizer, 10, Some(&document))
        .expect("Failed to create chunk iterator")
        .collect::<Result<Vec<_>, _>>()
        .expect("Chunk iteration should succeed");
    assert!(chunks.len() > 1, "Should break into multiple chunks");
    let has_long_word = chunks.iter().any(|chunk| {
        chunk
            .chunk_text(&tokenizer)
            .map(|text| text.contains("antidisestablishmentarianism"))
            .unwrap_or(false)
    });
    assert!(has_long_word, "Should find chunk containing the long word");
}
/// When a sentence must be split, the split does not carry text from far past the newline.
#[test]
fn test_newline_preference_for_breaking() {
    let text = "First part of sentence\nSecond part of sentence continues here";
    let tokenizer = create_tokenizer();
    let document = create_document(text);
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let chunks = ChunkIterator::new(&tokenized, &tokenizer, 25, Some(&document))
        .expect("Failed to create chunk iterator")
        .collect::<Result<Vec<_>, _>>()
        .expect("Chunk iteration should succeed");
    assert!(chunks.len() > 1, "Should break into multiple chunks");
    let first_chunk_text = chunks[0]
        .chunk_text(&tokenizer)
        .expect("Failed to get first chunk text");
    assert!(
        !first_chunk_text.contains("continues"),
        "First chunk should not contain text after newline: '{}'",
        first_chunk_text
    );
}
/// Empty input yields no chunks at all.
#[test]
fn test_empty_text_handling() {
    let text = "";
    let tokenizer = create_tokenizer();
    let document = create_document(text);
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 100, Some(&document))
        .expect("Failed to create chunk iterator");
    let first = chunk_iter.next();
    assert!(first.is_none(), "Empty text should produce no chunks");
}
/// A single short sentence becomes exactly one chunk holding the full text.
#[test]
fn test_single_sentence_chunk() {
    let text = "Short sentence.";
    let tokenizer = create_tokenizer();
    let document = create_document(text);
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 100, Some(&document))
        .expect("Failed to create chunk iterator");
    let only_chunk = chunk_iter
        .next()
        .expect("Should have a chunk")
        .expect("Chunk creation should succeed");
    let chunk_text = only_chunk
        .chunk_text(&tokenizer)
        .expect("Failed to get chunk text");
    assert_eq!(chunk_text, text);
    assert!(chunk_iter.next().is_none(), "Should have only one chunk");
}
/// A chunk covering every token reports the full text and the full character span.
#[test]
fn test_token_chunk_properties() {
    let text = "Test sentence.";
    let tokenizer = create_tokenizer();
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let whole_text = crate::tokenizer::TokenInterval::new(0, tokenized.tokens.len())
        .expect("Failed to create token interval");
    let chunk = TokenChunk::new(whole_text, Some(Arc::new(create_document(text))));

    let chunk_text = chunk
        .chunk_text(&tokenizer)
        .expect("Failed to get chunk text");
    assert_eq!(chunk_text, text);

    let sanitized = chunk
        .sanitized_chunk_text(&tokenizer)
        .expect("Failed to get sanitized text");
    assert_eq!(sanitized, text);

    let char_interval = chunk
        .char_interval(&tokenizer)
        .expect("Failed to get char interval");
    assert_eq!(char_interval.start_pos, Some(0));
    assert_eq!(char_interval.end_pos, Some(text.len()));
}
/// Mixed-length sentences with a small buffer produce several chunks; the chunks and the
/// reconstructed text are printed for inspection.
#[test]
fn test_progressive_chunking() {
    let text = "Short. Medium length sentence here. Very long sentence that might need to be broken up depending on buffer size.";
    let tokenizer = create_tokenizer();
    let document = create_document(text);
    let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
    let chunks = ChunkIterator::new(&tokenized, &tokenizer, 40, Some(&document))
        .expect("Failed to create chunk iterator")
        .collect::<Result<Vec<_>, _>>()
        .expect("Chunk iteration should succeed");
    assert!(chunks.len() > 1, "Should produce multiple chunks");

    println!("Debug: {} chunks created", chunks.len());
    let mut reconstructed = String::new();
    for (i, chunk) in chunks.iter().enumerate() {
        let chunk_text = chunk.chunk_text(&tokenizer).expect("Failed to get chunk text");
        println!("Chunk {}: {:?} (interval: {:?})", i, chunk_text, chunk.token_interval);
        reconstructed.push_str(&chunk_text);
    }
    println!("Original: {:?}", text);
    println!("Reconstructed: {:?}", reconstructed);
    assert!(chunks.len() >= 2, "Should produce multiple chunks for long text");
}
/// Without a document, text access fails and the document accessors return `None`.
#[test]
fn test_chunk_without_document() {
    let tokenizer = create_tokenizer();
    let first_token = crate::tokenizer::TokenInterval::new(0, 1)
        .expect("Failed to create token interval");
    let chunk = TokenChunk::new(first_token, None);
    assert!(
        chunk.chunk_text(&tokenizer).is_err(),
        "Should return error when no document is set"
    );
    assert!(chunk.document_id().is_none());
    assert!(chunk.additional_context().is_none());
}
/// Semantic chunking yields sequentially numbered, non-empty, non-overlapping chunks that
/// carry the document id.
#[test]
fn test_semantic_chunking_basic() {
    let config = ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 1000,
        semantic_similarity_threshold: 0.7,
        ..Default::default()
    };
    let chunker = TextChunker::with_config(config);
    let text = "Machine learning is a subset of artificial intelligence. It involves training algorithms on data to make predictions. Deep learning uses neural networks with multiple layers. Natural language processing helps computers understand human language.";
    let chunks = chunker.chunk_text(text, Some("test_doc".to_string())).unwrap();
    assert!(!chunks.is_empty(), "Should create at least one chunk");
    assert!(chunks.len() <= 10, "Should not create too many chunks");

    for (expected_id, chunk) in chunks.iter().enumerate() {
        assert_eq!(chunk.id, expected_id);
        assert!(!chunk.text.is_empty());
        assert!(chunk.char_length > 0);
        assert_eq!(chunk.document_id, Some("test_doc".to_string()));
    }

    for pair in chunks.windows(2) {
        let current_end = pair[0].char_offset + pair[0].char_length;
        let next_start = pair[1].char_offset;
        assert!(current_end <= next_start, "Chunks should not overlap");
    }
}
/// Empty text produces a single empty chunk at offset zero.
#[test]
fn test_semantic_chunking_empty_text() {
    let config = ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        ..Default::default()
    };
    let chunks = TextChunker::with_config(config).chunk_text("", None).unwrap();
    assert_eq!(chunks.len(), 1);
    let only = &chunks[0];
    assert_eq!(only.text, "");
    assert_eq!(only.char_length, 0);
    assert_eq!(only.char_offset, 0);
}
/// Text that fits in one chunk is returned unchanged with matching offset and length.
#[test]
fn test_semantic_chunking_small_text() {
    let config = ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 1000,
        ..Default::default()
    };
    let text = "Short text that fits in one chunk.";
    let chunks = TextChunker::with_config(config).chunk_text(text, None).unwrap();
    assert_eq!(chunks.len(), 1);
    let only = &chunks[0];
    assert_eq!(only.text, text);
    assert_eq!(only.char_offset, 0);
    assert_eq!(only.char_length, text.len());
}
/// The number of semantic chunks never exceeds `semantic_max_chunks`.
#[test]
fn test_semantic_chunking_with_max_chunks() {
    let config = ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 500,
        semantic_similarity_threshold: 0.5,
        semantic_max_chunks: Some(3),
        ..Default::default()
    };
    let chunker = TextChunker::with_config(config);
    let text = "This is a very long text about artificial intelligence and machine learning. It contains multiple paragraphs with different topics. The first paragraph discusses AI fundamentals. The second paragraph covers machine learning techniques. The third paragraph explores deep learning applications. The fourth paragraph examines natural language processing. This should create multiple semantic chunks that will need to be merged due to the max_chunks limit.";
    let chunks = chunker.chunk_text(text, None).unwrap();
    let produced = chunks.len();
    assert!(produced <= 3, "Should not exceed max_chunks limit: got {}, limit is 3", produced);
    assert!(!chunks.is_empty(), "Should create at least one chunk");
}
/// Chunks the same mixed-topic input with a low (0.3) and a high (0.9)
/// `semantic_similarity_threshold` and checks that both runs produce at least
/// one chunk. The two chunk counts are printed for inspection only; no
/// relationship between them is asserted.
#[test]
fn test_semantic_chunking_similarity_threshold() {
    let input = "Python is a programming language. Java is also a programming language. The weather is nice today. I like to eat pizza. Programming involves writing code. Food is essential for life.";

    // Both chunkers share every setting except the similarity threshold.
    let build_chunker = |threshold| {
        TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 200,
            semantic_similarity_threshold: threshold,
            ..Default::default()
        })
    };
    let permissive = build_chunker(0.3);
    let strict = build_chunker(0.9);

    let permissive_chunks = permissive.chunk_text(input, None).unwrap();
    let strict_chunks = strict.chunk_text(input, None).unwrap();

    println!(
        "Low threshold chunks: {}, High threshold chunks: {}",
        permissive_chunks.len(),
        strict_chunks.len()
    );
    assert!(!permissive_chunks.is_empty());
    assert!(!strict_chunks.is_empty());
}
/// Concatenating the text of every chunk, in order, must reproduce the
/// original input once leading and trailing whitespace is trimmed from both
/// sides of the comparison.
#[test]
fn test_semantic_chunking_preserves_text() {
    let input = "The quick brown fox jumps over the lazy dog. This is a test sentence. Machine learning is fascinating.";
    let config = ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 100,
        semantic_similarity_threshold: 0.7,
        ..Default::default()
    };
    let chunker = TextChunker::with_config(config);

    let chunks = chunker.chunk_text(input, None).unwrap();

    // Stitch the chunk texts back together without any separator.
    let rebuilt: String = chunks.iter().map(|chunk| chunk.text.as_str()).collect();
    assert_eq!(rebuilt.trim(), input.trim());
}
/// Runs semantic chunking with a very small `max_chunk_size` (10) and a
/// `semantic_similarity_threshold` of 2.0 (presumably outside the range the
/// chunker expects; verify against the chunker implementation).
///
/// Either outcome is accepted: on success the result must be non-empty and
/// contain no empty chunk; on failure the error is only printed.
#[test]
fn test_semantic_chunking_error_handling() {
    let input = "This is a test text for semantic chunking error handling.";
    let config = ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 10,
        semantic_similarity_threshold: 2.0,
        ..Default::default()
    };
    let chunker = TextChunker::with_config(config);

    match chunker.chunk_text(input, None) {
        Err(e) => {
            // An error is a tolerated outcome for this configuration.
            println!("Expected error occurred: {}", e);
        }
        Ok(chunks) => {
            assert!(!chunks.is_empty());
            for chunk in &chunks {
                assert!(!chunk.text.is_empty());
            }
        }
    }
}
/// Chunks the same input with the semantic strategy and with the deprecated
/// fixed-size strategy, using the same `max_chunk_size`, and checks that both
/// produce at least one chunk. Chunk counts and the input length are printed
/// for inspection only.
#[test]
fn test_semantic_vs_fixed_size_chunking() {
    let input = "Natural language processing is a field of artificial intelligence. It focuses on the interaction between computers and human language. Machine learning algorithms power many NLP applications. Deep learning has revolutionized computer vision and NLP.";

    let semantic_config = ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 150,
        semantic_similarity_threshold: 0.7,
        ..Default::default()
    };
    let by_meaning = TextChunker::with_config(semantic_config);

    // `FixedSize` is deprecated, but this test exercises it on purpose as the
    // baseline to compare against, so the lint is silenced for this statement.
    #[allow(deprecated)]
    let by_size = TextChunker::with_config(ChunkingConfig {
        strategy: ChunkingStrategy::FixedSize,
        max_chunk_size: 150,
        ..Default::default()
    });

    let semantic_result = by_meaning.chunk_text(input, None).unwrap();
    let fixed_result = by_size.chunk_text(input, None).unwrap();

    println!(
        "Semantic chunks: {}, Fixed chunks: {}",
        semantic_result.len(),
        fixed_result.len()
    );
    println!("Text length: {}", input.len());

    assert!(!semantic_result.is_empty());
    assert!(!fixed_result.is_empty());
}
/// End-to-end check of semantic chunking on a multi-sentence document.
///
/// Verifies that:
/// - between 1 and 10 chunks are produced,
/// - every chunk has non-empty text, a positive `char_length`, and carries the
///   document id passed to `chunk_text`,
/// - consecutive chunks do not overlap, i.e. each chunk ends at or before the
///   offset where the next one starts.
#[test]
fn test_semantic_chunking_integration() {
    // Built with struct-update syntax, like the other tests in this module,
    // instead of mutating a default-constructed config.
    let chunker = TextChunker::with_config(ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 100,
        ..Default::default()
    });
    let text = "This is a test document. It has multiple sentences with different topics. The first sentence introduces the topic. The second sentence provides more details. The third sentence concludes the discussion.";

    let chunks = chunker
        .chunk_text(text, Some("integration_test".to_string()))
        .unwrap();

    assert!(!chunks.is_empty());
    assert!(chunks.len() <= 10);

    for chunk in &chunks {
        assert!(!chunk.text.is_empty());
        assert!(chunk.char_length > 0);
        // Compare as `Option<&str>` to avoid allocating a `String` per chunk.
        assert_eq!(chunk.document_id.as_deref(), Some("integration_test"));
    }

    // `windows(2)` visits each adjacent pair and yields nothing for fewer than
    // two chunks, so no `len() - 1` arithmetic (and no underflow) is involved.
    for pair in chunks.windows(2) {
        let current_end = pair[0].char_offset + pair[0].char_length;
        let next_start = pair[1].char_offset;
        assert!(current_end <= next_start, "Chunks should not overlap");
    }

    println!(
        "Semantic chunking integration test passed with {} chunks",
        chunks.len()
    );
}
/// Every chunk produced by semantic chunking must carry the document id that
/// was passed to `chunk_text`.
#[test]
fn test_semantic_chunking_with_document_id() {
    let chunker = TextChunker::with_config(ChunkingConfig {
        strategy: ChunkingStrategy::Semantic,
        max_chunk_size: 100,
        ..Default::default()
    });
    let text = "This is a test document with multiple sentences. Each sentence should be processed correctly. The document ID should be preserved.";
    let document_id = "doc_123";

    let chunks = chunker
        .chunk_text(text, Some(document_id.to_string()))
        .unwrap();

    // Without this guard the loop below would pass vacuously if the chunker
    // returned no chunks at all.
    assert!(!chunks.is_empty());

    for chunk in &chunks {
        assert_eq!(chunk.document_id.as_deref(), Some(document_id));
    }
}
}