use crate::{Entity, Result};
#[cfg(feature = "chunking")]
use text_splitter::TextSplitter;
/// Settings that control how long text is cut into chunks.
///
/// Sizes are counted in `char`s (Unicode scalar values), not bytes, because
/// `chunk_text` works on a `Vec<char>` view of the input.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Target upper bound for one chunk, in `char`s. Text that is no longer
    /// than this is returned as a single chunk.
    pub chunk_size: usize,
    /// How many `char`s the start of a chunk is moved back from the end of the
    /// previous chunk, so neighbouring chunks share context.
    pub overlap: usize,
    /// `true` snaps chunk ends to sentence boundaries; `false` snaps them to
    /// word boundaries.
    pub respect_sentences: bool,
    /// Buffer size setting.
    /// NOTE(review): no function visible in this part of the file reads this
    /// field — confirm its meaning and unit against the callers.
    pub buffer_size: usize,
}
impl Default for ChunkConfig {
fn default() -> Self {
Self {
chunk_size: 10_000,
overlap: 100,
respect_sentences: true,
buffer_size: 1000,
}
}
}
impl ChunkConfig {
pub fn no_chunking() -> Self {
Self {
chunk_size: usize::MAX,
overlap: 0,
respect_sentences: false,
buffer_size: usize::MAX,
}
}
pub fn long_document() -> Self {
Self {
chunk_size: 50_000,
overlap: 200,
respect_sentences: true,
buffer_size: 5000,
}
}
pub fn realtime() -> Self {
Self {
chunk_size: 1000,
overlap: 50,
respect_sentences: false,
buffer_size: 100,
}
}
}
/// One piece of a larger text together with its position in that text.
#[derive(Debug, Clone)]
pub struct TextChunk {
    /// The chunk's content, copied out of the source text.
    pub text: String,
    /// Offset of the chunk's first `char` within the source text, counted in
    /// `char`s (not bytes).
    pub char_offset: usize,
}
/// Splits `text` into overlapping chunks according to `config`.
///
/// All lengths and offsets are counted in `char`s, so multi-byte text is never
/// cut inside a code point.
///
/// * Empty input yields no chunks.
/// * Input no longer than `config.chunk_size` is returned as a single chunk at
///   offset 0.
/// * Otherwise each chunk ends at a sentence boundary (when
///   `config.respect_sentences` is set) or a word boundary near
///   `position + chunk_size`. If no usable boundary lies after the chunk's
///   start — e.g. a single token longer than `chunk_size` — the chunk is cut
///   hard at `position + chunk_size`.
/// * The next chunk starts `config.overlap` chars before the previous chunk's
///   end, but always at least one char after the previous chunk's start, so the
///   loop terminates for any overlap value.
///
/// Every returned chunk is non-empty and `char_offset` increases strictly.
pub fn chunk_text(text: &str, config: &ChunkConfig) -> Vec<TextChunk> {
    let chars: Vec<char> = text.chars().collect();
    let text_len = chars.len();
    if text_len == 0 {
        return Vec::new();
    }
    if text_len <= config.chunk_size {
        return vec![TextChunk {
            text: text.to_string(),
            char_offset: 0,
        }];
    }

    // A chunk size of zero could only ever produce empty chunks (losing the
    // whole text); degrade to one char per chunk instead.
    let chunk_size = config.chunk_size.max(1);

    let mut chunks = Vec::new();
    let mut position = 0;
    while position < text_len {
        // `saturating_add` keeps huge chunk sizes from overflowing.
        let chunk_end = position.saturating_add(chunk_size).min(text_len);
        let boundary = if chunk_end >= text_len {
            text_len
        } else if config.respect_sentences {
            find_sentence_boundary(&chars, position, chunk_end)
        } else {
            find_word_boundary(&chars, chunk_end)
        };
        // The boundary helpers scan backwards from `chunk_end` and are not
        // limited to this chunk, so they can land at or before `position`.
        // Using such a boundary would produce an empty chunk or an inverted
        // (panicking) slice range; fall back to a hard cut.
        let actual_end = if boundary > position {
            boundary
        } else {
            chunk_end
        };

        chunks.push(TextChunk {
            text: chars[position..actual_end].iter().collect(),
            char_offset: position,
        });
        if actual_end >= text_len {
            break;
        }

        // Step back by the overlap, but always make forward progress.
        let overlap_position = actual_end.saturating_sub(config.overlap);
        position = if overlap_position <= position {
            position + 1
        } else {
            overlap_position
        };
    }
    chunks
}
/// Finds a sentence boundary at or before `target`, scanning backwards.
///
/// `chars` is the whole text, `start` is the index where the current chunk
/// begins, and `target` is the preferred (exclusive) end of the chunk. Up to
/// 200 chars before `target` are inspected, last to first:
///
/// * CJK terminators (`。`, full-width `！`, full-width `？`) always end a
///   sentence.
/// * ASCII `.`, `!`, `?` end a sentence only when followed by whitespace or the
///   end of the text, so tokens such as `3.14` or `a?b=c` are not split.
///
/// The returned index points just past the terminator and any whitespace that
/// follows it; because that whitespace is consumed, the result can be slightly
/// larger than `target`. Boundaries that are not strictly after `start` are
/// ignored.
///
/// If no sentence end qualifies, the result of [`find_word_boundary`] is used
/// when it lies after `start`; otherwise `target` (clamped to the text length)
/// is returned so the caller always gets a boundary past `start` whenever
/// `target > start`.
pub fn find_sentence_boundary(chars: &[char], start: usize, target: usize) -> usize {
    /// How far back from `target` the scan looks for a sentence end.
    const LOOKBACK: usize = 200;

    let scan_end = target.min(chars.len());
    let scan_start = target.saturating_sub(LOOKBACK);

    for i in (scan_start..scan_end).rev() {
        let ends_sentence = match chars[i] {
            // U+3002 ideographic full stop, U+FF01 / U+FF1F full-width `!` / `?`.
            // Written as escapes so text normalisation cannot silently turn
            // them into their ASCII look-alikes.
            '\u{3002}' | '\u{FF01}' | '\u{FF1F}' => true,
            '.' | '!' | '?' => chars
                .get(i + 1)
                .map_or(true, |next| next.is_whitespace()),
            _ => false,
        };
        if !ends_sentence {
            continue;
        }

        // Attach trailing whitespace to the sentence it follows.
        let trailing_ws = chars[i + 1..]
            .iter()
            .take_while(|c| c.is_whitespace())
            .count();
        let end = i + 1 + trailing_ws;
        if end > start {
            return end;
        }
    }

    // The word-boundary search is not limited to the current chunk; only accept
    // its answer when it actually lies inside the chunk.
    let fallback = find_word_boundary(chars, target);
    if fallback > start {
        fallback
    } else {
        scan_end
    }
}
/// Returns the index just past the last whitespace char located before
/// `target`, i.e. the start of the word that `target` falls into.
///
/// * A `target` at or beyond the end of `chars` yields `chars.len()`.
/// * When there is no whitespace before `target`, `target` itself is returned.
///
/// The backwards search runs all the way to the start of `chars`.
pub fn find_word_boundary(chars: &[char], target: usize) -> usize {
    let len = chars.len();
    if target >= len {
        return len;
    }
    match chars[..target].iter().rposition(|c| c.is_whitespace()) {
        Some(ws_index) => ws_index + 1,
        None => target,
    }
}
/// Policy used by `deduplicate_overlapping` to decide which of several
/// overlapping entities survives.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OverlapStrategy {
    /// Walk entities by start position and keep each one that begins at or
    /// after the end of the previously kept entity.
    KeepFirst,
    /// Consider entities from highest to lowest confidence and keep each one
    /// that does not overlap an already kept entity.
    KeepHighestConfidence,
    /// Resolve overlaps only between entities of the same type, keeping the
    /// longer span; overlapping entities of different types all survive.
    KeepLongerSameType,
    /// Consider entities from shortest to longest span and drop any that
    /// contains or overlaps an already kept entity.
    KeepShortest,
}
/// Removes overlapping entities from `entities` in place, using `strategy` to
/// pick the survivors.
///
/// Lists with zero or one element are returned untouched. For longer lists the
/// result is ordered by start position for every strategy.
///
/// * `KeepFirst` — sort by start (ties: higher confidence first) and keep each
///   entity that starts at or after the end of the last kept one.
/// * `KeepHighestConfidence` — visit entities by descending confidence and keep
///   those that do not overlap anything already kept.
/// * `KeepLongerSameType` — sort by `(start, end)`; an entity overlapping a kept
///   entity of the same type replaces it only if strictly longer, otherwise it
///   is dropped. Entities of different types never displace each other.
/// * `KeepShortest` — visit entities by ascending span length (ties: higher
///   confidence first) and drop any that contains or overlaps a kept entity.
///
/// Confidence values that cannot be ordered (such as NaN) never cause a panic;
/// they are ranked below every comparable value.
pub fn deduplicate_overlapping(entities: &mut Vec<Entity>, strategy: OverlapStrategy) {
    if entities.len() <= 1 {
        return;
    }
    let result = match strategy {
        OverlapStrategy::KeepFirst => {
            entities.sort_by(|a, b| {
                a.start()
                    .cmp(&b.start())
                    .then_with(|| cmp_confidence_desc(&a.confidence, &b.confidence))
            });
            let mut out = Vec::with_capacity(entities.len());
            let mut last_end = 0;
            for entity in entities.drain(..) {
                if entity.start() >= last_end {
                    last_end = entity.end();
                    out.push(entity);
                }
            }
            out
        }
        OverlapStrategy::KeepHighestConfidence => {
            entities.sort_by(|a, b| cmp_confidence_desc(&a.confidence, &b.confidence));
            let mut out: Vec<Entity> = Vec::with_capacity(entities.len());
            for entity in entities.drain(..) {
                let overlaps = out
                    .iter()
                    .any(|kept| entity.start() < kept.end() && entity.end() > kept.start());
                if !overlaps {
                    out.push(entity);
                }
            }
            out.sort_by_key(|e| e.start());
            out
        }
        OverlapStrategy::KeepLongerSameType => {
            entities.sort_by_key(|e| (e.start(), e.end()));
            let mut out: Vec<Entity> = Vec::with_capacity(entities.len());
            for entity in entities.drain(..) {
                let overlapping_idx = out.iter().rposition(|prev| {
                    entity.start() < prev.end()
                        && prev.start() < entity.end()
                        && prev.entity_type == entity.entity_type
                });
                match overlapping_idx {
                    Some(idx) => {
                        let prev_len = out[idx].end() - out[idx].start();
                        let cand_len = entity.end() - entity.start();
                        if cand_len > prev_len {
                            out[idx] = entity;
                        }
                    }
                    None => out.push(entity),
                }
            }
            // A replacement puts a later-starting entity into an earlier slot,
            // so restore position order like the other strategies do.
            out.sort_by_key(|e| e.start());
            out
        }
        OverlapStrategy::KeepShortest => {
            entities.sort_unstable_by(|a, b| {
                let len_a = a.end() - a.start();
                let len_b = b.end() - b.start();
                len_a
                    .cmp(&len_b)
                    .then_with(|| cmp_confidence_desc(&a.confidence, &b.confidence))
            });
            let mut out: Vec<Entity> = Vec::with_capacity(entities.len());
            for entity in entities.drain(..) {
                let is_superset_of_existing = out
                    .iter()
                    .any(|kept| entity.start() <= kept.start() && entity.end() >= kept.end());
                if is_superset_of_existing {
                    continue;
                }
                let overlaps_existing = out
                    .iter()
                    .any(|kept| entity.start() < kept.end() && kept.start() < entity.end());
                if !overlaps_existing {
                    out.push(entity);
                }
            }
            out.sort_unstable_by_key(|e| e.start());
            out
        }
    };
    *entities = result;
}

/// Orders two confidence values so that the higher one sorts first.
///
/// Values that cannot be compared (NaN) are placed after all comparable values
/// and are equal to each other, which keeps the comparator a consistent total
/// order for the sort routines.
fn cmp_confidence_desc<T: PartialOrd>(a: &T, b: &T) -> std::cmp::Ordering {
    b.partial_cmp(a).unwrap_or_else(|| {
        // A value that is not comparable even with itself is NaN-like.
        let a_unordered = a.partial_cmp(a).is_none();
        let b_unordered = b.partial_cmp(b).is_none();
        a_unordered.cmp(&b_unordered)
    })
}
/// Chunks `text`, runs `extract_fn` on the chunks in parallel and merges the
/// results into one de-duplicated, position-ordered entity list.
///
/// `extract_fn` receives each chunk's text and the chunk's `char` offset within
/// `text`. The merge step compares spans across chunks directly, so the
/// returned entities are expected to carry offsets relative to the whole text.
///
/// * No chunks (empty text) yields an empty list without calling `extract_fn`.
/// * A single chunk is processed on the calling thread and its result is
///   returned as-is.
/// * Otherwise the chunks are divided into contiguous batches, one scoped
///   worker thread per batch. The number of workers is capped at
///   `std::thread::available_parallelism()` so that long documents do not spawn
///   one OS thread per chunk.
///
/// Merging walks the results in chunk order. Entities with a `(start, end)`
/// span that was already seen are dropped (the first occurrence wins,
/// regardless of entity type); the remainder is passed through
/// `deduplicate_overlapping` with `OverlapStrategy::KeepLongerSameType`.
///
/// # Errors
///
/// Returns the error of the first chunk (in chunk order) whose extraction
/// failed. All chunks are still processed before the error is reported.
///
/// # Panics
///
/// If `extract_fn` panics on a worker thread, that panic is re-raised on the
/// calling thread with its original payload.
pub fn extract_chunked_parallel<F>(
    text: &str,
    config: &ChunkConfig,
    extract_fn: F,
) -> Result<Vec<Entity>>
where
    F: Fn(&str, usize) -> Result<Vec<Entity>> + Send + Sync,
{
    let chunks = chunk_text(text, config);
    if chunks.is_empty() {
        return Ok(Vec::new());
    }
    if chunks.len() == 1 {
        return extract_fn(&chunks[0].text, chunks[0].char_offset);
    }

    // Bound the number of OS threads by the available parallelism.
    let worker_count = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1)
        .min(chunks.len());
    // Ceiling division: every chunk lands in exactly one batch.
    let batch_len = (chunks.len() + worker_count - 1) / worker_count;

    let results: Vec<Result<Vec<Entity>>> = std::thread::scope(|s| {
        let extract_fn = &extract_fn;
        let handles: Vec<_> = chunks
            .chunks(batch_len)
            .map(|batch| {
                s.spawn(move || {
                    batch
                        .iter()
                        .map(|chunk| extract_fn(&chunk.text, chunk.char_offset))
                        .collect::<Vec<_>>()
                })
            })
            .collect();
        // Batches are contiguous and joined in spawn order, so the flattened
        // results stay in chunk order.
        handles
            .into_iter()
            .flat_map(|handle| {
                handle
                    .join()
                    .unwrap_or_else(|payload| std::panic::resume_unwind(payload))
            })
            .collect()
    });

    let mut seen = std::collections::HashSet::new();
    let mut all_entities = Vec::new();
    for result in results {
        for entity in result? {
            // Overlapping chunks report the same span twice; keep the first.
            if seen.insert((entity.start(), entity.end())) {
                all_entities.push(entity);
            }
        }
    }
    all_entities.sort_by_key(|e| (e.start(), e.end()));
    deduplicate_overlapping(&mut all_entities, OverlapStrategy::KeepLongerSameType);
    Ok(all_entities)
}
/// Splits `text` with `text_splitter::TextSplitter` configured with
/// `chunk_capacity`, returning each piece together with its `char` offset as
/// reported by the splitter.
///
/// Returns an empty list when `text` is empty or `chunk_capacity` is zero.
///
/// NOTE(review): how `chunk_capacity` is measured (chars vs. tokens) and which
/// boundaries are preferred is decided by `TextSplitter` — confirm against the
/// `text-splitter` version in use.
#[cfg(feature = "chunking")]
#[cfg_attr(docsrs, doc(cfg(feature = "chunking")))]
pub fn chunk_text_semantic(text: &str, chunk_capacity: usize) -> Vec<TextChunk> {
    let nothing_to_split = chunk_capacity == 0 || text.is_empty();
    if nothing_to_split {
        return Vec::new();
    }

    let splitter = TextSplitter::new(chunk_capacity);
    let mut chunks = Vec::new();
    for piece in splitter.chunk_char_indices(text) {
        chunks.push(TextChunk {
            text: piece.chunk.to_string(),
            char_offset: piece.char_offset,
        });
    }
    chunks
}
// Unit tests for the chunking helpers (`chunk_text`, `find_sentence_boundary`,
// `find_word_boundary`), the overlap strategies of `deduplicate_overlapping`,
// and `extract_chunked_parallel`.
#[cfg(test)]
mod tests {
use super::*;
use crate::EntityType;
// Smoke test: every preset constructor can be called.
#[test]
fn test_chunk_config_presets() {
let _no_chunk = ChunkConfig::no_chunking();
let _long = ChunkConfig::long_document();
let _realtime = ChunkConfig::realtime();
}
// A sentence boundary is found inside the (0, 20] window.
#[test]
fn test_find_sentence_boundary() {
let text: Vec<char> = "Hello world. This is a test.".chars().collect();
let boundary = find_sentence_boundary(&text, 0, 20);
assert!(boundary > 0);
assert!(boundary <= 20);
}
// A target at or past the end of the text clamps to the text length.
#[test]
fn test_find_word_boundary_at_end() {
let chars: Vec<char> = "hello world".chars().collect();
assert_eq!(find_word_boundary(&chars, 100), chars.len());
assert_eq!(find_word_boundary(&chars, chars.len()), chars.len());
}
// A target inside a word snaps back to the start of that word.
#[test]
fn test_find_word_boundary_mid_word() {
let chars: Vec<char> = "hello world foo".chars().collect();
let boundary = find_word_boundary(&chars, 14);
assert_eq!(boundary, 12, "should break before 'foo'");
}
// Without any whitespace the target itself is returned.
#[test]
fn test_find_word_boundary_no_whitespace() {
let chars: Vec<char> = "abcdefghij".chars().collect();
assert_eq!(find_word_boundary(&chars, 5), 5);
}
// The CJK full stop counts as a sentence end even without trailing whitespace.
#[test]
fn test_find_sentence_boundary_cjk_punctuation() {
let text: Vec<char> = "这是测试。下一句话开始了".chars().collect();
let boundary = find_sentence_boundary(&text, 0, text.len());
assert!(boundary <= text.len());
assert!(boundary >= 5, "should be at or after the CJK period");
}
// With no punctuation the function falls back to a word boundary.
#[test]
fn test_find_sentence_boundary_no_punctuation() {
let chars: Vec<char> = "no punctuation here at all".chars().collect();
let boundary = find_sentence_boundary(&chars, 0, 20);
assert!(boundary > 0 && boundary <= 20);
}
// `!` followed by whitespace ends a sentence; the whitespace is consumed.
#[test]
fn test_find_sentence_boundary_exclamation_and_question() {
let chars: Vec<char> = "Wow! Really? Yes indeed.".chars().collect();
let boundary = find_sentence_boundary(&chars, 0, 10);
assert_eq!(boundary, 5, "should split after 'Wow! '");
}
// Pins the exact field values of the `no_chunking` preset.
#[test]
fn test_chunk_config_no_chunking_values() {
let cfg = ChunkConfig::no_chunking();
assert_eq!(cfg.chunk_size, usize::MAX);
assert_eq!(cfg.overlap, 0);
assert!(!cfg.respect_sentences);
assert_eq!(cfg.buffer_size, usize::MAX);
}
// Pins the exact field values of the `long_document` preset.
#[test]
fn test_chunk_config_long_document_values() {
let cfg = ChunkConfig::long_document();
assert_eq!(cfg.chunk_size, 50_000);
assert_eq!(cfg.overlap, 200);
assert!(cfg.respect_sentences);
assert_eq!(cfg.buffer_size, 5000);
}
// Pins the exact field values of the `realtime` preset.
#[test]
fn test_chunk_config_realtime_values() {
let cfg = ChunkConfig::realtime();
assert_eq!(cfg.chunk_size, 1000);
assert_eq!(cfg.overlap, 50);
assert!(!cfg.respect_sentences);
assert_eq!(cfg.buffer_size, 100);
}
// Pins the exact field values of `ChunkConfig::default()`.
#[test]
fn test_chunk_config_default_values() {
let cfg = ChunkConfig::default();
assert_eq!(cfg.chunk_size, 10_000);
assert_eq!(cfg.overlap, 100);
assert!(cfg.respect_sentences);
assert_eq!(cfg.buffer_size, 1000);
}
// Text shorter than `chunk_size` comes back as one chunk at offset 0.
#[test]
fn test_chunk_text_small_text_single_chunk() {
let config = ChunkConfig::default(); let chunks = chunk_text("Hello world.", &config);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].text, "Hello world.");
assert_eq!(chunks[0].char_offset, 0);
}
// Empty input produces no chunks.
#[test]
fn test_chunk_text_empty() {
let config = ChunkConfig::default();
let chunks = chunk_text("", &config);
assert!(chunks.is_empty());
}
// Long text is split; offsets increase and the last chunk reaches the end.
#[test]
fn test_chunk_text_splits_large_text() {
let config = ChunkConfig {
chunk_size: 20,
overlap: 5,
respect_sentences: false,
buffer_size: 100,
};
let text =
"Alice met Bob in Paris. Charlie visited London yesterday. Dave works in Tokyo today.";
let chunks = chunk_text(text, &config);
assert!(chunks.len() > 1, "should split into multiple chunks");
for i in 1..chunks.len() {
assert!(
chunks[i].char_offset > chunks[i - 1].char_offset,
"chunk offsets must increase"
);
}
let last = &chunks[chunks.len() - 1];
let total_covered = last.char_offset + last.text.chars().count();
assert!(
total_covered >= text.chars().count(),
"chunks must cover all text"
);
}
// With `respect_sentences` the first chunk ends at a sentence end.
#[test]
fn test_chunk_text_respects_sentences() {
let config = ChunkConfig {
chunk_size: 30,
overlap: 5,
respect_sentences: true,
buffer_size: 100,
};
let text = "First sentence here. Second sentence here. Third sentence here.";
let chunks = chunk_text(text, &config);
assert!(chunks.len() >= 2);
assert!(
chunks[0].text.ends_with(". ") || chunks[0].text.ends_with('.'),
"first chunk should end near sentence boundary: {:?}",
chunks[0].text
);
}
// A non-zero overlap makes the second chunk start before the first one ends.
// The inner `if` is redundant after the preceding length assertion.
#[test]
fn test_chunk_text_overlap_creates_redundancy() {
let config = ChunkConfig {
chunk_size: 20,
overlap: 10,
respect_sentences: false,
buffer_size: 100,
};
let text = "0123456789 abcdefghij klmnopqrst uvwxyz";
let chunks = chunk_text(text, &config);
assert!(chunks.len() >= 2);
if chunks.len() >= 2 {
let c0_end = chunks[0].char_offset + chunks[0].text.chars().count();
let c1_start = chunks[1].char_offset;
assert!(
c1_start < c0_end,
"overlap should cause chunk start ({}) < prev chunk end ({})",
c1_start,
c0_end
);
}
}
// Offsets are char-based: each chunk equals the char slice at its offset.
#[test]
fn test_chunk_text_unicode() {
let config = ChunkConfig {
chunk_size: 10,
overlap: 3,
respect_sentences: false,
buffer_size: 100,
};
let text = "東京は日本の首都です。パリはフランスの首都です。";
let chunks = chunk_text(text, &config);
assert!(chunks.len() >= 2);
let chars: Vec<char> = text.chars().collect();
for chunk in &chunks {
let expected: String = chars
[chunk.char_offset..chunk.char_offset + chunk.text.chars().count()]
.iter()
.collect();
assert_eq!(chunk.text, expected, "chunk text must match offset slice");
}
}
// KeepFirst: equal starts are ordered by confidence, so the 0.9 entity wins.
#[test]
fn test_overlap_strategy_keep_first_basic() {
let mut entities = vec![
Entity::new("New York", EntityType::Location, 0, 8, 0.7),
Entity::new("New York City", EntityType::Location, 0, 13, 0.9),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepFirst);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "New York City");
}
// KeepFirst: disjoint entities are all kept.
#[test]
fn test_overlap_strategy_keep_first_non_overlapping() {
let mut entities = vec![
Entity::new("Alice", EntityType::Person, 0, 5, 0.9),
Entity::new("Bob", EntityType::Person, 10, 13, 0.8),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepFirst);
assert_eq!(entities.len(), 2);
}
// KeepFirst: in a chain of overlaps the middle entity is dropped, which lets
// the third one (starting after the first ends) survive.
#[test]
fn test_overlap_strategy_keep_first_chain() {
let mut entities = vec![
Entity::new("AB", EntityType::Person, 0, 5, 0.9),
Entity::new("BC", EntityType::Person, 3, 8, 0.8),
Entity::new("CD", EntityType::Person, 6, 10, 0.7),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepFirst);
assert_eq!(entities.len(), 2);
assert_eq!(entities[0].text, "AB");
assert_eq!(entities[1].text, "CD");
}
// KeepHighestConfidence: of two overlapping entities the 0.9 one is kept.
#[test]
fn test_overlap_strategy_keep_highest_confidence() {
let mut entities = vec![
Entity::new("New York", EntityType::Location, 0, 8, 0.9),
Entity::new("York City", EntityType::Location, 4, 13, 0.7),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepHighestConfidence);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "New York");
}
// KeepHighestConfidence: output is ordered by position, not by confidence.
#[test]
fn test_overlap_strategy_keep_highest_confidence_preserves_position_order() {
let mut entities = vec![
Entity::new("Alice", EntityType::Person, 20, 25, 0.95),
Entity::new("Bob", EntityType::Person, 0, 3, 0.5),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepHighestConfidence);
assert_eq!(entities.len(), 2);
assert_eq!(entities[0].text, "Bob");
assert_eq!(entities[1].text, "Alice");
}
// KeepHighestConfidence: the middle entity overlaps both neighbours and has
// the highest confidence, so it alone survives.
#[test]
fn test_overlap_strategy_keep_highest_confidence_three_way() {
let mut entities = vec![
Entity::new("A", EntityType::Person, 0, 5, 0.5),
Entity::new("B", EntityType::Person, 3, 8, 0.9),
Entity::new("C", EntityType::Person, 6, 10, 0.7),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepHighestConfidence);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "B");
}
// KeepLongerSameType: the longer span wins even with lower confidence.
#[test]
fn test_overlap_strategy_keep_longer_same_type_basic() {
let mut entities = vec![
Entity::new("New York", EntityType::Location, 0, 8, 0.7),
Entity::new("New York City", EntityType::Location, 0, 13, 0.6),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepLongerSameType);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "New York City");
}
// KeepLongerSameType: overlapping entities of different types both survive.
#[test]
fn test_overlap_strategy_keep_longer_same_type_different_types_preserved() {
let mut entities = vec![
Entity::new("New York", EntityType::Location, 0, 8, 0.7),
Entity::new("New York Times", EntityType::Organization, 0, 14, 0.6),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepLongerSameType);
assert_eq!(entities.len(), 2);
}
// KeepLongerSameType: different types are kept and ordered by (start, end).
#[test]
fn test_overlap_strategy_keep_longer_same_type_shorter_kept_when_different_type() {
let mut entities = vec![
Entity::new("Paris", EntityType::Location, 0, 5, 0.9),
Entity::new("Paris Hilton", EntityType::Person, 0, 12, 0.8),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepLongerSameType);
assert_eq!(entities.len(), 2);
assert_eq!(entities[0].text, "Paris");
assert_eq!(entities[1].text, "Paris Hilton");
}
// KeepShortest: a span that fully contains a kept shorter span is dropped.
#[test]
fn test_overlap_strategy_keep_shortest_drops_supersets() {
let mut entities = vec![
Entity::new(
"Department of Defense",
EntityType::Organization,
4,
25,
0.8,
),
Entity::new(
"The Department of Defense",
EntityType::Organization,
0,
25,
0.7,
),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepShortest);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "Department of Defense");
}
// KeepShortest: disjoint entities are all kept.
#[test]
fn test_overlap_strategy_keep_shortest_no_overlap() {
let mut entities = vec![
Entity::new("IBM", EntityType::Organization, 0, 3, 0.9),
Entity::new("NASA", EntityType::Organization, 10, 14, 0.8),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepShortest);
assert_eq!(entities.len(), 2);
}
// KeepShortest: equal lengths are ordered by confidence; the partially
// overlapping lower-confidence entity is dropped.
#[test]
fn test_overlap_strategy_keep_shortest_partial_overlap_dropped() {
let mut entities = vec![
Entity::new("AB", EntityType::Person, 0, 5, 0.9),
Entity::new("BC", EntityType::Person, 3, 8, 0.8),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepShortest);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "AB");
}
// Every strategy leaves an empty list empty.
#[test]
fn test_overlap_strategy_empty_input() {
for strategy in [
OverlapStrategy::KeepFirst,
OverlapStrategy::KeepHighestConfidence,
OverlapStrategy::KeepLongerSameType,
OverlapStrategy::KeepShortest,
] {
let mut entities: Vec<Entity> = vec![];
deduplicate_overlapping(&mut entities, strategy);
assert!(entities.is_empty());
}
}
// Every strategy leaves a single entity untouched.
#[test]
fn test_overlap_strategy_single_entity() {
for strategy in [
OverlapStrategy::KeepFirst,
OverlapStrategy::KeepHighestConfidence,
OverlapStrategy::KeepLongerSameType,
OverlapStrategy::KeepShortest,
] {
let mut entities = vec![Entity::new("Alice", EntityType::Person, 0, 5, 0.9)];
deduplicate_overlapping(&mut entities, strategy);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "Alice");
}
}
// Every strategy returns its result ordered by start position. The input
// here is disjoint, so no replacement or dropping takes place.
#[test]
fn test_overlap_strategy_result_sorted_by_position() {
for strategy in [
OverlapStrategy::KeepFirst,
OverlapStrategy::KeepHighestConfidence,
OverlapStrategy::KeepLongerSameType,
OverlapStrategy::KeepShortest,
] {
let mut entities = vec![
Entity::new("C", EntityType::Person, 20, 25, 0.5),
Entity::new("A", EntityType::Person, 0, 3, 0.9),
Entity::new("B", EntityType::Person, 10, 15, 0.7),
];
deduplicate_overlapping(&mut entities, strategy);
for i in 1..entities.len() {
assert!(
entities[i].start() >= entities[i - 1].start(),
"result must be sorted by position for {:?}",
strategy
);
}
}
}
// KeepLongerSameType: a same-type overlap is detected even when an entity of
// another type sits between the two; the longer (earlier) Location is kept.
#[test]
fn test_keep_longer_same_type_non_adjacent_interleaved() {
let mut entities = vec![
Entity::new("A", EntityType::Location, 0, 10, 0.8),
Entity::new("B", EntityType::Person, 5, 12, 0.8),
Entity::new("C", EntityType::Location, 8, 15, 0.8),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepLongerSameType);
assert!(
entities.iter().any(|e| e.text == "B"),
"Person entity B must be preserved (different type)"
);
let locs: Vec<&Entity> = entities
.iter()
.filter(|e| e.entity_type == EntityType::Location)
.collect();
assert_eq!(locs.len(), 1, "exactly one Location should survive");
assert_eq!(
locs[0].text, "A",
"longer Location A should be kept over shorter C"
);
for i in 1..entities.len() {
assert!(entities[i].start() >= entities[i - 1].start());
}
}
// KeepLongerSameType: same setup, but the later Location is longer and
// replaces the earlier one. Output order is not asserted here.
#[test]
fn test_keep_longer_same_type_non_adjacent_candidate_wins() {
let mut entities = vec![
Entity::new("A", EntityType::Location, 0, 10, 0.8),
Entity::new("B", EntityType::Person, 5, 12, 0.8),
Entity::new("C", EntityType::Location, 8, 20, 0.8),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepLongerSameType);
assert!(
entities.iter().any(|e| e.text == "B"),
"Person entity B must be preserved"
);
let locs: Vec<&Entity> = entities
.iter()
.filter(|e| e.entity_type == EntityType::Location)
.collect();
assert_eq!(locs.len(), 1, "exactly one Location should survive");
assert_eq!(
locs[0].text, "C",
"longer Location C[8,20] should replace shorter A[0,10]"
);
}
// KeepFirst: a span nested inside an earlier-starting span is dropped.
#[test]
fn test_keep_first_contained_span_dropped() {
let mut entities = vec![
Entity::new("outer", EntityType::Organization, 0, 5, 0.9),
Entity::new("inner", EntityType::Organization, 1, 3, 0.95),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepFirst);
assert_eq!(entities.len(), 1);
assert_eq!(
entities[0].text, "outer",
"outer span should be kept; inner contained span dropped"
);
}
// NOTE: despite the test's name, the assertion is that the OUTER span is
// kept. KeepFirst orders by start position first, so the inner span's higher
// confidence does not matter.
#[test]
fn test_keep_first_outer_dropped_when_inner_higher_confidence() {
let mut entities = vec![
Entity::new("outer", EntityType::Organization, 0, 5, 0.5),
Entity::new("inner", EntityType::Organization, 1, 3, 0.95),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepFirst);
assert_eq!(entities.len(), 1);
assert_eq!(entities[0].text, "outer");
}
// KeepHighestConfidence must not panic when a confidence is NaN.
#[test]
fn test_keep_highest_confidence_nan_does_not_crash() {
let mut entities = vec![
Entity::new("Alice", EntityType::Person, 0, 5, 0.9),
Entity::new("NaN entity", EntityType::Person, 3, 8, f64::NAN),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepHighestConfidence);
assert!(
!entities.is_empty(),
"result must be non-empty after NaN entity processing"
);
}
// KeepHighestConfidence: a NaN entity among disjoint entities does not cause
// the comparable ones to be lost.
#[test]
fn test_keep_highest_confidence_nan_non_overlapping_no_crash() {
let mut entities = vec![
Entity::new("Alice", EntityType::Person, 0, 5, 0.9),
Entity::new("NaN ent", EntityType::Person, 20, 27, f64::NAN),
Entity::new("Bob", EntityType::Person, 10, 13, 0.7),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepHighestConfidence);
assert!(
entities.iter().any(|e| e.text == "Alice"),
"Alice must be kept"
);
assert!(entities.iter().any(|e| e.text == "Bob"), "Bob must be kept");
}
// CJK text has no whitespace after the full stop; the boundary is placed
// directly after it.
#[test]
fn test_find_sentence_boundary_cjk_no_whitespace_after_period() {
let text = "这是第一句。这是第二句。这是第三句";
let chars: Vec<char> = text.chars().collect();
let boundary = find_sentence_boundary(&chars, 0, 10);
assert_eq!(
boundary, 6,
"should split immediately after 。 (index 5), placing boundary at char 6"
);
}
// The scan runs backwards from the target, so the last full stop before the
// target is chosen.
#[test]
fn test_find_sentence_boundary_cjk_second_period() {
let text = "这是第一句。这是第二句。这是第三句";
let chars: Vec<char> = text.chars().collect();
let boundary = find_sentence_boundary(&chars, 0, chars.len() - 1);
assert_eq!(
boundary, 12,
"should split after second 。 at index 11, placing boundary at char 12"
);
}
// The extractor callback receives each chunk's offset; entities shifted by
// that offset all appear exactly once, ordered by position.
#[test]
fn test_extract_chunked_parallel_offset_adjustment() {
let text: String = "x".repeat(60);
let config = ChunkConfig {
chunk_size: 30,
overlap: 10,
respect_sentences: false,
buffer_size: 100,
};
let expected_offsets: Vec<usize> = chunk_text(&text, &config)
.iter()
.map(|c| c.char_offset)
.collect();
let result = extract_chunked_parallel(&text, &config, |_chunk, char_offset| {
Ok(vec![Entity::new(
"token",
EntityType::Organization,
5 + char_offset,
10 + char_offset,
0.9,
)])
});
let entities = result.expect("extract_chunked_parallel must not error");
for off in &expected_offsets {
assert!(
entities
.iter()
.any(|e| e.start() == 5 + off && e.end() == 10 + off),
"global entity [{}, {}] must be present; got: {:?}",
5 + off,
10 + off,
entities
.iter()
.map(|e| (e.start(), e.end()))
.collect::<Vec<_>>()
);
}
assert_eq!(
entities.len(),
expected_offsets.len(),
"one entity per chunk, no duplicates; got: {:?}",
entities
.iter()
.map(|e| (e.start(), e.end()))
.collect::<Vec<_>>()
);
for i in 1..entities.len() {
assert!(entities[i].start() >= entities[i - 1].start());
}
}
// The same span reported by several chunks is collapsed into one entity.
#[test]
fn test_extract_chunked_parallel_boundary_dedup() {
let text: String = "x".repeat(60);
let config = ChunkConfig {
chunk_size: 30,
overlap: 10,
respect_sentences: false,
buffer_size: 100,
};
let result = extract_chunked_parallel(&text, &config, |_chunk, _char_offset| {
Ok(vec![Entity::new("shared", EntityType::Person, 5, 10, 0.9)])
});
let entities = result.expect("must not error");
assert_eq!(
entities.len(),
1,
"duplicate global span [5,10] must be deduplicated; got {} entities",
entities.len()
);
assert_eq!(entities[0].start(), 5);
assert_eq!(entities[0].end(), 10);
}
// Empty text yields no chunks, so the extractor is never consulted.
#[test]
fn test_extract_chunked_parallel_empty_text() {
let result = extract_chunked_parallel("", &ChunkConfig::default(), |_chunk, _offset| {
Ok(vec![Entity::new("x", EntityType::Person, 0, 1, 0.9)])
});
let entities = result.expect("must not error on empty text");
assert!(entities.is_empty(), "empty text must produce no entities");
}
// Chunking terminates and offsets strictly increase when overlap equals
// the chunk size.
#[test]
fn test_chunk_text_overlap_equals_chunk_size_terminates() {
let config = ChunkConfig {
chunk_size: 5,
overlap: 5, respect_sentences: false,
buffer_size: 100,
};
let text = "abcdefghijklmno"; let chunks = chunk_text(text, &config);
assert!(!chunks.is_empty(), "must produce at least one chunk");
for i in 1..chunks.len() {
assert!(
chunks[i].char_offset > chunks[i - 1].char_offset,
"chunk offsets must strictly increase even when overlap == chunk_size"
);
}
}
// Chunking terminates and offsets strictly increase when overlap exceeds
// the chunk size.
#[test]
fn test_chunk_text_overlap_greater_than_chunk_size_terminates() {
let config = ChunkConfig {
chunk_size: 4,
overlap: 10, respect_sentences: false,
buffer_size: 100,
};
let text = "abcdefghij"; let chunks = chunk_text(text, &config);
assert!(!chunks.is_empty());
for i in 1..chunks.len() {
assert!(
chunks[i].char_offset > chunks[i - 1].char_offset,
"forward progress must hold when overlap > chunk_size"
);
}
}
// KeepShortest with full ties (same length, same confidence): disjoint
// entities both survive, and for overlapping ones two runs on identical
// input keep the same entity.
#[test]
fn test_keep_shortest_same_length_same_confidence_deterministic() {
let mut entities = vec![
Entity::new("abc", EntityType::Person, 0, 3, 0.7),
Entity::new("def", EntityType::Person, 10, 13, 0.7),
];
deduplicate_overlapping(&mut entities, OverlapStrategy::KeepShortest);
assert_eq!(
entities.len(),
2,
"non-overlapping same-length same-confidence entities must both survive"
);
let make = || {
vec![
Entity::new("AB", EntityType::Organization, 0, 5, 0.8),
Entity::new("BC", EntityType::Organization, 3, 8, 0.8),
]
};
let mut first_run = make();
deduplicate_overlapping(&mut first_run, OverlapStrategy::KeepShortest);
assert_eq!(
first_run.len(),
1,
"overlapping same-length entities: one must be dropped"
);
let kept_text = first_run[0].text.clone();
let mut second_run = make();
deduplicate_overlapping(&mut second_run, OverlapStrategy::KeepShortest);
assert_eq!(second_run.len(), 1);
assert_eq!(
second_run[0].text, kept_text,
"KeepShortest must be deterministic: same input must always keep the same entity"
);
}
// Every strategy leaves an empty list empty (same check as
// `test_overlap_strategy_empty_input`, with a per-strategy message).
#[test]
fn test_all_strategies_empty_list_is_noop() {
for strategy in [
OverlapStrategy::KeepFirst,
OverlapStrategy::KeepHighestConfidence,
OverlapStrategy::KeepLongerSameType,
OverlapStrategy::KeepShortest,
] {
let mut entities: Vec<Entity> = Vec::new();
deduplicate_overlapping(&mut entities, strategy);
assert!(
entities.is_empty(),
"empty input must remain empty for {:?}",
strategy
);
}
}
}