use super::*;
use crate::Confidence;
use std::sync::LazyLock;
static DEFAULT_STACKED: LazyLock<StackedNER> = LazyLock::new(StackedNER::default);
fn extract(text: &str) -> Vec<Entity> {
DEFAULT_STACKED.extract_entities(text, None).unwrap()
}
fn has_type(entities: &[Entity], ty: &EntityType) -> bool {
entities.iter().any(|e| e.entity_type == *ty)
}
#[test]
fn test_default_finds_patterns() {
let e = extract("Cost: $100");
assert!(has_type(&e, &EntityType::Money));
}
#[test]
fn test_default_finds_heuristic() {
let e = extract("Mr. Smith said hello");
assert!(has_type(&e, &EntityType::Person));
}
#[test]
fn test_default_finds_both() {
let e = extract("Dr. Smith charges $200/hr");
assert!(has_type(&e, &EntityType::Money));
}
#[test]
fn test_no_overlaps() {
let e = extract("Price is $100 from John at Google Inc.");
for i in 0..e.len() {
for j in (i + 1)..e.len() {
let overlap = e[i].start() < e[j].end() && e[j].start() < e[i].end();
assert!(!overlap, "Overlap: {:?} and {:?}", e[i], e[j]);
}
}
}
#[test]
fn test_sorted_output() {
let e = extract("$100 for John in Paris on 2024-01-15");
for i in 1..e.len() {
assert!(e[i - 1].start() <= e[i].start());
}
}
#[cfg(feature = "onnx")]
#[test]
fn test_default_includes_ml_backend_when_available() {
let stats = DEFAULT_STACKED.stats();
if stats.layer_count >= 3 {
let has_ml = stats.layer_names.iter().any(|name| {
let n = name.to_lowercase();
n.contains("bert") || n.contains("gliner")
});
assert!(
has_ml,
"StackedNER with {} layers should include an ML backend. Got layers: {:?}",
stats.layer_count, stats.layer_names
);
} else {
assert_eq!(stats.layer_count, 2);
assert!(stats.layer_names.iter().any(|n| n.contains("regex")));
assert!(stats.layer_names.iter().any(|n| n.contains("heuristic")));
}
}
#[test]
#[should_panic(expected = "requires at least one layer")]
fn test_builder_empty_panics() {
let _ner = StackedNER::builder().build();
}
#[test]
fn test_builder_single_layer() {
let ner = StackedNER::builder().layer(RegexNER::new()).build();
let e = ner.extract_entities("$100", None).unwrap();
assert!(has_type(&e, &EntityType::Money));
}
#[test]
fn test_builder_layer_names() {
let ner = StackedNER::builder()
.layer(RegexNER::new())
.layer(HeuristicNER::new())
.build();
let names = ner.layer_names();
assert!(names.iter().any(|n| n.contains("regex")));
assert!(names.iter().any(|n| n.contains("heuristic")));
}
#[test]
fn test_builder_strategy() {
let ner = StackedNER::builder()
.layer(RegexNER::new())
.strategy(ConflictStrategy::LongestSpan)
.build();
assert_eq!(ner.strategy(), ConflictStrategy::LongestSpan);
}
#[test]
fn test_pattern_only() {
let ner = StackedNER::pattern_only();
let e = ner.extract_entities("$100 for Dr. Smith", None).unwrap();
assert!(has_type(&e, &EntityType::Money));
assert!(!has_type(&e, &EntityType::Person));
}
#[test]
fn test_heuristic_only() {
let ner = StackedNER::heuristic_only();
let e = ner.extract_entities("$100 for John", None).unwrap();
assert!(
!has_type(&e, &EntityType::Money),
"Should NOT find money without pattern layer: {:?}",
e
);
}
#[test]
fn test_strategy_default_is_priority() {
assert_eq!(DEFAULT_STACKED.strategy(), ConflictStrategy::Priority);
}
use crate::MockModel;
fn mock_model(name: &'static str, entities: Vec<Entity>) -> MockModel {
MockModel::new(name).with_entities(entities)
}
fn mock_entity(text: &str, start: usize, ty: EntityType, conf: f64) -> Entity {
Entity::new(text, ty, start, start + text.len(), conf)
}
#[test]
fn test_priority_first_wins() {
let layer1 = mock_model(
"l1",
vec![mock_entity("New York", 0, EntityType::Location, 0.8)],
);
let layer2 = mock_model(
"l2",
vec![mock_entity("New York City", 0, EntityType::Location, 0.9)],
);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::Priority)
.build();
let e = ner.extract_entities("New York City", None).unwrap();
assert_eq!(e.len(), 1);
assert_eq!(e[0].text, "New York"); }
#[test]
fn test_longest_span_wins() {
let layer1 = mock_model(
"l1",
vec![mock_entity("New York", 0, EntityType::Location, 0.8)],
);
let layer2 = mock_model(
"l2",
vec![mock_entity("New York City", 0, EntityType::Location, 0.7)],
);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::LongestSpan)
.build();
let e = ner.extract_entities("New York City", None).unwrap();
assert_eq!(e.len(), 1);
assert_eq!(e[0].text, "New York City"); }
#[test]
fn test_highest_conf_wins() {
let layer1 = mock_model(
"l1",
vec![mock_entity("Apple", 0, EntityType::Organization, 0.6)],
);
let layer2 = mock_model(
"l2",
vec![mock_entity("Apple", 0, EntityType::Organization, 0.95)],
);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::HighestConf)
.build();
let e = ner.extract_entities("Apple Inc", None).unwrap();
assert_eq!(e.len(), 1);
assert!(e[0].confidence > 0.9);
}
#[test]
fn test_union_keeps_all() {
let layer1 = mock_model("l1", vec![mock_entity("John", 0, EntityType::Person, 0.8)]);
let layer2 = mock_model("l2", vec![mock_entity("John", 0, EntityType::Person, 0.9)]);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::Union)
.build();
let e = ner.extract_entities("John is here", None).unwrap();
assert_eq!(e.len(), 2); }
#[test]
fn test_highest_conf_multiple_overlaps_ties_prefer_existing() {
let text = "aaaaa bbbbb";
let layer1 = mock_model(
"l1",
vec![
mock_entity("aaaaa", 0, EntityType::Person, 0.9),
mock_entity("bbbbb", 10, EntityType::Person, 0.9), ],
);
let layer2 = mock_model("l2", vec![mock_entity(text, 0, EntityType::Person, 0.1)]);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::HighestConf)
.build();
let e = ner.extract_entities(text, None).unwrap();
assert_eq!(e.len(), 1);
assert_eq!(e[0].text, "aaaaa", "should keep earliest existing entity");
assert_eq!(e[0].start(), 0);
assert_eq!(e[0].end(), 5);
}
#[test]
fn test_clamped_spans_keep_text_consistent() {
let layer = MockModel::new("l1")
.with_entities(vec![Entity::new(
"hello world",
EntityType::Person,
0,
100,
0.9,
)])
.without_validation();
let ner = StackedNER::builder()
.layer(layer)
.strategy(ConflictStrategy::Priority)
.build();
let text = "hello";
let e = ner.extract_entities(text, None).unwrap();
assert_eq!(e.len(), 1);
assert_eq!(e[0].start(), 0);
assert_eq!(e[0].end(), 5);
assert_eq!(e[0].text, "hello");
}
#[test]
fn test_non_overlapping_always_kept() {
for strategy in [
ConflictStrategy::Priority,
ConflictStrategy::LongestSpan,
ConflictStrategy::HighestConf,
] {
let ner = StackedNER::builder()
.layer(mock_model(
"l1",
vec![mock_entity("John", 0, EntityType::Person, 0.8)],
))
.layer(mock_model(
"l2",
vec![mock_entity("Paris", 8, EntityType::Location, 0.9)],
))
.strategy(strategy)
.build();
let e = ner.extract_entities("John in Paris", None).unwrap();
assert_eq!(e.len(), 2, "Strategy {:?} should keep both", strategy);
}
}
#[test]
fn test_press_release() {
let text = r#"
PRESS RELEASE - January 15, 2024
Mr. John Smith, CEO of Acme Corporation, announced today that the company
will invest $50 million in their San Francisco headquarters.
Contact: press@acme.com or call (555) 123-4567
The expansion is expected to increase revenue by 25%.
"#;
let e = extract(text);
assert!(has_type(&e, &EntityType::Date));
assert!(has_type(&e, &EntityType::Money));
assert!(has_type(&e, &EntityType::Email));
assert!(has_type(&e, &EntityType::Phone));
assert!(has_type(&e, &EntityType::Percent));
}
#[test]
fn test_empty_text() {
let e = extract("");
assert!(e.is_empty());
}
#[test]
fn test_no_entities() {
let e = extract("the quick brown fox jumps over the lazy dog");
assert!(e.is_empty());
}
#[test]
fn test_supported_types() {
let types = DEFAULT_STACKED.supported_types();
assert!(types.contains(&EntityType::Date));
assert!(types.contains(&EntityType::Money));
assert!(types.contains(&EntityType::Person));
assert!(types.contains(&EntityType::Organization));
assert!(types.contains(&EntityType::Location));
}
#[test]
fn test_stats() {
let stats = DEFAULT_STACKED.stats();
assert!(
(2..=4).contains(&stats.layer_count),
"Expected 2-4 layers, got {}",
stats.layer_count
);
assert_eq!(stats.strategy, ConflictStrategy::Priority);
assert_eq!(stats.layer_names.len(), stats.layer_count);
assert!(stats.layer_names.iter().any(|n| n.contains("regex")));
assert!(stats.layer_names.iter().any(|n| n.contains("heuristic")));
}
#[test]
fn test_many_overlapping_entities() {
let text = "New York City is a large metropolitan area";
let layer1 = mock_model(
"l1",
vec![mock_entity("New York", 0, EntityType::Location, 0.8)],
);
let layer2 = mock_model(
"l2",
vec![mock_entity("York City", 4, EntityType::Location, 0.7)],
);
let layer3 = mock_model(
"l3",
vec![mock_entity("New York City", 0, EntityType::Location, 0.9)],
);
let layer4 = mock_model(
"l4",
vec![mock_entity("City is", 9, EntityType::Location, 0.6)],
);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.layer(layer3)
.layer(layer4)
.strategy(ConflictStrategy::Priority)
.build();
let e = ner.extract_entities(text, None).unwrap();
assert!(!e.is_empty());
}
#[test]
fn test_large_entity_set() {
let mut layer1_entities = Vec::new();
let mut layer2_entities = Vec::new();
let base_text = "word ".repeat(2000);
for i in 0..500 {
let start = i * 10;
let end = start + 5;
if end < base_text.len() {
layer1_entities.push(mock_entity(
&base_text[start..end],
start,
EntityType::Person,
0.5 + (i % 10) as f64 / 20.0,
));
}
}
for i in 0..500 {
let start = i * 10 + 3; let end = start + 5;
if end < base_text.len() {
layer2_entities.push(mock_entity(
&base_text[start..end],
start,
EntityType::Organization,
0.5 + (i % 10) as f64 / 20.0,
));
}
}
let layer1 = mock_model("l1", layer1_entities);
let layer2 = mock_model("l2", layer2_entities);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::LongestSpan)
.build();
let e = ner.extract_entities(&base_text, None).unwrap();
assert!(!e.is_empty());
assert!(e.len() <= 1000); }
#[test]
fn test_layer_error_handling() {
#[derive(Clone)]
struct FailingModel {
name: &'static str,
}
impl crate::sealed::Sealed for FailingModel {}
impl crate::Model for FailingModel {
fn extract_entities(
&self,
_text: &str,
_language: Option<Language>,
) -> crate::Result<Vec<crate::Entity>> {
Err(crate::Error::Inference(format!(
"intentional failure from {}",
self.name
)))
}
fn supported_types(&self) -> Vec<crate::EntityType> {
vec![crate::EntityType::Person]
}
fn is_available(&self) -> bool {
true
}
fn name(&self) -> &'static str {
self.name
}
}
let ner_fail_first = StackedNER::builder()
.layer(FailingModel { name: "fail" }) .layer(crate::HeuristicNER::new())
.strategy(ConflictStrategy::Priority)
.build();
let result = ner_fail_first.extract_entities("Dr. John Smith at Apple", None);
assert!(
result.is_ok(),
"Should succeed with partial results from later layers: {:?}",
result,
);
let ner_all_fail = StackedNER::builder()
.layer(FailingModel { name: "fail1" })
.layer(FailingModel { name: "fail2" })
.strategy(ConflictStrategy::Priority)
.build();
let result = ner_all_fail.extract_entities("anything", None);
assert!(result.is_err(), "Should fail when ALL layers fail");
let ner_fail_second = StackedNER::builder()
.layer(crate::HeuristicNER::new()) .layer(FailingModel { name: "fail" }) .strategy(ConflictStrategy::Priority)
.build();
let result = ner_fail_second.extract_entities("Dr. John Smith works at Apple Inc.", None);
assert!(
result.is_ok(),
"Should succeed with partial results: {:?}",
result
);
let entities = result.unwrap();
assert!(
!entities.is_empty(),
"Should have entities from working layer"
);
let ner_all_working = StackedNER::builder()
.layer(crate::RegexNER::new())
.layer(crate::HeuristicNER::new())
.strategy(ConflictStrategy::Priority)
.build();
let long_text = "word ".repeat(2000);
let _ = ner_all_working.extract_entities(&long_text, None).unwrap();
}
#[test]
fn test_many_layers() {
let mut builder = StackedNER::builder();
let layer_names = [
"layer0", "layer1", "layer2", "layer3", "layer4", "layer5", "layer6", "layer7", "layer8",
"layer9",
];
for (i, &name) in layer_names.iter().enumerate() {
let entities = vec![mock_entity(
"test",
0,
EntityType::Person,
0.5 + (i as f64 / 20.0),
)];
builder = builder.layer(mock_model(name, entities));
}
let ner = builder.strategy(ConflictStrategy::Priority).build();
let e = ner.extract_entities("test", None).unwrap();
assert_eq!(e.len(), 1);
}
#[test]
fn test_union_with_many_overlaps() {
let mut builder = StackedNER::builder();
let layer_names = ["layer0", "layer1", "layer2", "layer3", "layer4"];
for (i, &name) in layer_names.iter().enumerate() {
let entities = vec![mock_entity(
"New York",
0,
EntityType::Location,
0.5 + (i as f64 / 10.0),
)];
builder = builder.layer(mock_model(name, entities));
}
let ner = builder.strategy(ConflictStrategy::Union).build();
let e = ner.extract_entities("New York", None).unwrap();
assert_eq!(e.len(), 5);
}
#[test]
fn test_highest_conf_with_ties() {
let layer1 = mock_model(
"l1",
vec![mock_entity("Apple", 0, EntityType::Organization, 0.8)],
);
let layer2 = mock_model(
"l2",
vec![mock_entity("Apple", 0, EntityType::Organization, 0.8)], );
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::HighestConf)
.build();
let e = ner.extract_entities("Apple Inc", None).unwrap();
assert_eq!(e.len(), 1);
assert_eq!(e[0].confidence, 0.8);
}
#[test]
fn test_longest_span_with_ties() {
let layer1 = mock_model(
"l1",
vec![mock_entity("Apple", 0, EntityType::Organization, 0.8)],
);
let layer2 = mock_model(
"l2",
vec![mock_entity("Apple", 0, EntityType::Organization, 0.9)], );
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::LongestSpan)
.build();
let e = ner.extract_entities("Apple Inc", None).unwrap();
assert_eq!(e.len(), 1);
assert_eq!(e[0].text, "Apple");
}
#[test]
fn test_method_for_backend_name_all_branches() {
use crate::ExtractionMethod;
assert_eq!(method_for_backend_name("regex"), ExtractionMethod::Pattern);
assert_eq!(
method_for_backend_name("heuristic"),
ExtractionMethod::Heuristic
);
assert_eq!(
method_for_backend_name("onnx-bert"),
ExtractionMethod::Neural
);
assert_eq!(method_for_backend_name("candle"), ExtractionMethod::Neural);
assert_eq!(method_for_backend_name(""), ExtractionMethod::Neural);
}
#[test]
fn test_resolve_priority_always_keeps_existing() {
let existing = mock_entity("A", 0, EntityType::Person, 0.1);
let candidate = mock_entity("ABCDE", 0, EntityType::Person, 1.0);
assert!(matches!(
ConflictStrategy::Priority.resolve(&existing, &candidate),
Resolution::KeepExisting
));
}
#[test]
fn test_resolve_longest_span_picks_longer() {
let short = mock_entity("AB", 0, EntityType::Person, 0.9);
let long = mock_entity("ABCDE", 0, EntityType::Person, 0.1);
assert!(matches!(
ConflictStrategy::LongestSpan.resolve(&short, &long),
Resolution::Replace
));
assert!(matches!(
ConflictStrategy::LongestSpan.resolve(&long, &short),
Resolution::KeepExisting
));
}
#[test]
fn test_resolve_longest_span_equal_prefers_existing() {
let a = mock_entity("ABC", 0, EntityType::Person, 0.5);
let b = mock_entity("XYZ", 0, EntityType::Person, 0.9);
assert!(matches!(
ConflictStrategy::LongestSpan.resolve(&a, &b),
Resolution::KeepExisting
));
}
#[test]
fn test_resolve_highest_conf_picks_higher() {
let low = mock_entity("A", 0, EntityType::Person, 0.3);
let high = mock_entity("A", 0, EntityType::Person, 0.9);
assert!(matches!(
ConflictStrategy::HighestConf.resolve(&low, &high),
Resolution::Replace
));
assert!(matches!(
ConflictStrategy::HighestConf.resolve(&high, &low),
Resolution::KeepExisting
));
}
#[test]
fn test_resolve_highest_conf_equal_prefers_existing() {
let a = mock_entity("A", 0, EntityType::Person, 0.7);
let b = mock_entity("B", 0, EntityType::Person, 0.7);
assert!(matches!(
ConflictStrategy::HighestConf.resolve(&a, &b),
Resolution::KeepExisting
));
}
#[test]
fn test_resolve_union_always_keeps_both() {
let a = mock_entity("A", 0, EntityType::Person, 0.1);
let b = mock_entity("BCDEF", 0, EntityType::Organization, 1.0);
assert!(matches!(
ConflictStrategy::Union.resolve(&a, &b),
Resolution::KeepBoth
));
}
#[test]
fn test_try_build_empty_returns_error() {
let result = StackedNER::builder().try_build();
assert!(result.is_err());
let msg = format!("{}", result.err().unwrap());
assert!(
msg.contains("at least one layer"),
"unexpected error: {msg}"
);
}
#[test]
fn test_try_build_ok_single_layer() {
let result = StackedNER::builder().layer(RegexNER::new()).try_build();
assert!(result.is_ok());
}
#[test]
fn test_invalid_span_start_ge_end_skipped() {
#[derive(Clone)]
struct ZeroWidthModel;
impl crate::sealed::Sealed for ZeroWidthModel {}
impl crate::Model for ZeroWidthModel {
fn extract_entities(
&self,
_text: &str,
_language: Option<Language>,
) -> crate::Result<Vec<Entity>> {
Ok(vec![Entity::new("ghost", EntityType::Person, 5, 5, 0.9)])
}
fn supported_types(&self) -> Vec<EntityType> {
vec![EntityType::Person]
}
fn is_available(&self) -> bool {
true
}
fn name(&self) -> &'static str {
"zero-width"
}
}
let ner = StackedNER::builder()
.layer(ZeroWidthModel)
.strategy(ConflictStrategy::Priority)
.build();
let result = ner.extract_entities("hello world", None).unwrap();
assert!(
result.is_empty(),
"zero-width entity should be filtered out, got: {result:?}"
);
}
#[test]
fn test_provenance_not_overwritten_when_already_set() {
use crate::{ExtractionMethod, Provenance};
let mut entity = mock_entity("Apple", 0, EntityType::Organization, 0.9);
entity.provenance = Some(Provenance {
source: Cow::Borrowed("custom-source"),
method: ExtractionMethod::Pattern,
pattern: Some("custom-pattern".into()),
raw_confidence: Some(Confidence::new(0.5)),
model_version: None,
timestamp: None,
});
let layer = MockModel::new("l1").with_entities(vec![entity]);
let ner = StackedNER::builder()
.layer(layer)
.strategy(ConflictStrategy::Priority)
.build();
let e = ner.extract_entities("Apple", None).unwrap();
assert_eq!(e.len(), 1);
let prov = e[0].provenance.as_ref().unwrap();
assert_eq!(prov.source.as_ref(), "custom-source");
assert_eq!(prov.method, ExtractionMethod::Pattern);
assert_eq!(prov.pattern.as_deref(), Some("custom-pattern"));
}
#[test]
fn test_dedup_removes_same_span_same_type_non_union() {
let e1 = mock_entity("Apple", 0, EntityType::Organization, 0.8);
let e2 = mock_entity("Apple", 0, EntityType::Organization, 0.8);
let layer1 = mock_model("l1", vec![e1]);
let layer2 = mock_model("l2", vec![e2]);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::Priority)
.build();
let e = ner.extract_entities("Apple", None).unwrap();
assert_eq!(e.len(), 1, "duplicates should be deduplicated");
}
#[test]
fn test_multiple_overlaps_longest_span_replaces() {
let text = "New York City area is large";
let layer1 = mock_model(
"l1",
vec![
mock_entity("New", 0, EntityType::Location, 0.5),
mock_entity("York", 4, EntityType::Location, 0.5),
mock_entity("City", 9, EntityType::Location, 0.5),
],
);
let layer2 = mock_model(
"l2",
vec![mock_entity("New York City", 0, EntityType::Location, 0.4)],
);
let ner = StackedNER::builder()
.layer(layer1)
.layer(layer2)
.strategy(ConflictStrategy::LongestSpan)
.build();
let e = ner.extract_entities(text, None).unwrap();
assert_eq!(e.len(), 1);
assert_eq!(e[0].text, "New York City");
}
#[test]
fn test_stats_custom_build() {
let ner = StackedNER::builder()
.layer(mock_model("alpha", vec![]))
.layer(mock_model("beta", vec![]))
.layer(mock_model("gamma", vec![]))
.strategy(ConflictStrategy::HighestConf)
.build();
let stats = ner.stats();
assert_eq!(stats.layer_count, 3);
assert_eq!(stats.strategy, ConflictStrategy::HighestConf);
assert_eq!(stats.layer_names, vec!["alpha", "beta", "gamma"]);
}
#[test]
fn test_name_reflects_layers() {
let ner = StackedNER::builder()
.layer(RegexNER::new())
.layer(HeuristicNER::new())
.build();
let name = ner.name();
assert!(name.starts_with("stacked("), "name = {name}");
assert!(name.contains("regex"), "name should contain regex: {name}");
assert!(
name.contains("heuristic"),
"name should contain heuristic: {name}"
);
}
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
use std::sync::LazyLock;
static FAST_STACK: LazyLock<StackedNER> = LazyLock::new(|| {
StackedNER::builder()
.layer(RegexNER::new())
.layer(HeuristicNER::new())
.strategy(ConflictStrategy::Priority)
.build()
});
proptest! {
#![proptest_config(ProptestConfig {
cases: 50,
// nextest runs from the workspace root; default persistence can warn.
failure_persistence: None,
..ProptestConfig::default()
})]
#[test]
fn never_panics(text in ".*") {
let _ = FAST_STACK.extract_entities(&text, None);
}
#[test]
fn valid_spans(text in ".{0,1000}") {
let entities = FAST_STACK.extract_entities(&text, None).unwrap();
let text_char_count = text.chars().count();
for entity in entities {
prop_assert!(
entity.start() < entity.end(),
"Invalid span: start={}, end={}",
entity.start(),
entity.end()
);
if text_char_count > 0 && entity.end() > text_char_count + 2 {
prop_assert!(
entity.end() <= text_char_count + 2,
"Entity end significantly exceeds text length: end={}, text_len={}",
entity.end(),
text_char_count
);
}
}
}
#[test]
fn confidence_in_range(text in ".{0,1000}") {
let entities = FAST_STACK.extract_entities(&text, None).unwrap();
for entity in entities {
prop_assert!(entity.confidence >= 0.0 && entity.confidence <= 1.0,
"Confidence out of range: {}", entity.confidence);
}
}
#[test]
fn sorted_output(text in ".{0,1000}") {
let entities = FAST_STACK.extract_entities(&text, None).unwrap();
for i in 1..entities.len() {
let prev = &entities[i - 1];
let curr = &entities[i];
prop_assert!(
prev.start() < curr.start() || (prev.start() == curr.start() && prev.end() <= curr.end()),
"Entities not sorted: prev=[{},{}), curr=[{}, {})",
prev.start(), prev.end(), curr.start(), curr.end()
);
}
}
#[test]
fn no_overlaps_default_strategy(text in ".{0,500}") {
let entities = FAST_STACK.extract_entities(&text, None).unwrap();
for i in 0..entities.len() {
for j in (i + 1)..entities.len() {
let e1 = &entities[i];
let e2 = &entities[j];
let overlap = e1.start() < e2.end() && e2.start() < e1.end();
prop_assert!(!overlap, "Overlapping entities with Priority strategy: {:?} and {:?}", e1, e2);
}
}
}
#[test]
fn entity_text_matches_span(text in ".{0,500}") {
let entities = FAST_STACK.extract_entities(&text, None).unwrap();
let text_chars: Vec<char> = text.chars().collect();
let text_char_count = text_chars.len();
for entity in entities {
if entity.start() < text_char_count && entity.end() <= text_char_count && entity.start() < entity.end() {
let span_text: String = text_chars[entity.start()..entity.end()].iter().collect();
let entity_text_normalized = entity.text.trim().to_lowercase();
let span_text_normalized = span_text.trim().to_lowercase();
let exact_match = entity_text_normalized == span_text_normalized;
let substring_match = span_text_normalized.contains(&entity_text_normalized) ||
entity_text_normalized.contains(&span_text_normalized);
let entity_chars: Vec<char> = entity_text_normalized.chars().collect();
let span_chars: Vec<char> = span_text_normalized.chars().collect();
let common_chars = entity_chars.iter()
.filter(|c| span_chars.contains(c))
.count();
let overlap_ratio = if entity_chars.len().max(span_chars.len()) > 0 {
common_chars as f64 / entity_chars.len().max(span_chars.len()) as f64
} else {
1.0
};
let is_valid_match = exact_match || substring_match || overlap_ratio > 0.2;
let has_control_chars = entity.text.chars().any(|c| c.is_control()) ||
span_text.chars().any(|c| c.is_control());
let has_null_bytes = entity.text.contains('\0') || span_text.contains('\0');
let has_weird_unicode = entity.text.chars().any(|c| c as u32 > 0xFFFF) ||
span_text.chars().any(|c| c as u32 > 0xFFFF);
let has_non_printable = entity.text.chars().any(|c| !c.is_ascii() && c.is_control()) ||
span_text.chars().any(|c| !c.is_ascii() && c.is_control());
let should_skip = (has_control_chars || has_null_bytes || has_weird_unicode || has_non_printable) && overlap_ratio < 0.3;
let both_short = entity.text.len() <= 2 && span_text.len() <= 2;
let should_skip_short = both_short && !exact_match && overlap_ratio < 0.5;
let single_char_mismatch = entity.text.chars().count() == 1 && span_text.chars().count() == 1 &&
entity.text != span_text;
let completely_different = !exact_match && !substring_match && overlap_ratio < 0.1 &&
entity.text.len() <= 3 && span_text.len() <= 3;
let has_empty = entity.text.is_empty() || span_text.is_empty();
let has_problematic_unicode_in_text = text.chars().any(|c| {
c.is_control() ||
c as u32 > 0xFFFF ||
(c as u32 >= 0x300 && c as u32 <= 0x36F) || (c as u32 >= 0x200B && c as u32 <= 0x200F) || (c as u32 >= 0x202A && c as u32 <= 0x202E) || c == '\u{FEFF}' });
let has_problematic_unicode = has_problematic_unicode_in_text || entity.text.chars().any(|c| {
c.is_control() ||
c as u32 > 0xFFFF ||
(c as u32 >= 0x300 && c as u32 <= 0x36F) || (c as u32 >= 0x200B && c as u32 <= 0x200F) || (c as u32 >= 0x202A && c as u32 <= 0x202E) }) || span_text.chars().any(|c| {
c.is_control() ||
c as u32 > 0xFFFF ||
(c as u32 >= 0x300 && c as u32 <= 0x36F) ||
(c as u32 >= 0x200B && c as u32 <= 0x200F) ||
(c as u32 >= 0x202A && c as u32 <= 0x202E)
});
let should_skip_problematic = has_problematic_unicode && overlap_ratio < 0.5;
if !should_skip && !should_skip_short && !single_char_mismatch && !completely_different &&
!has_empty && !has_problematic_unicode && !should_skip_problematic {
prop_assert!(
is_valid_match,
"Entity text doesn't match span: expected '{}', got '{}' at [{},{}) (overlap: {:.2})",
span_text, entity.text, entity.start(), entity.end(), overlap_ratio
);
}
}
}
}
#[test]
fn union_allows_overlaps(text in ".{0,200}") {
let ner = StackedNER::builder()
.layer(RegexNER::new())
.layer(HeuristicNER::new())
.strategy(ConflictStrategy::Union)
.build();
let entities = ner.extract_entities(&text, None).unwrap();
for e in &entities {
prop_assert!(e.start() <= e.end(), "inverted span: {}..{}", e.start(), e.end());
prop_assert!(e.confidence.value() >= 0.0 && e.confidence.value() <= 1.0);
}
}
#[test]
fn multiple_layers_consistent(text in ".{0,200}") {
let ner1 = StackedNER::builder()
.layer(RegexNER::new())
.build();
let ner2 = StackedNER::builder()
.layer(RegexNER::new())
.layer(HeuristicNER::new())
.build();
let e1 = ner1.extract_entities(&text, None).unwrap();
let e2 = ner2.extract_entities(&text, None).unwrap();
for entity in &e1 {
let found = e2.iter().any(|e| {
let spans_match = e.start() == entity.start() && e.end() == entity.end();
spans_match
&& (e.text == entity.text
|| e.text.trim().to_lowercase() == entity.text.trim().to_lowercase())
|| (e.entity_type == entity.entity_type
&& e.start() <= entity.start()
&& e.end() >= entity.end())
});
if !found && !e2.is_empty() {
eprintln!(
"note: ner1 entity '{}' ({},{}) not found in ner2 ({} entities)",
entity.text, entity.start(), entity.end(), e2.len()
);
}
}
}
#[test]
fn all_strategies_valid(text in ".{0,200}") {
let strategies = [
ConflictStrategy::Priority,
ConflictStrategy::LongestSpan,
ConflictStrategy::HighestConf,
ConflictStrategy::Union,
];
let text_char_count = text.chars().count();
for strategy in strategies.iter() {
let ner = StackedNER::builder()
.layer(RegexNER::new())
.layer(HeuristicNER::new())
.strategy(*strategy)
.build();
let entities = ner.extract_entities(&text, None).unwrap();
for entity in entities {
prop_assert!(entity.start() < entity.end(), "Invalid span: start={}, end={}", entity.start(), entity.end());
prop_assert!(entity.end() <= text_char_count, "Entity end exceeds text: end={}, text_len={}", entity.end(), text_char_count);
prop_assert!(entity.confidence >= 0.0 && entity.confidence <= 1.0, "Invalid confidence: {}", entity.confidence);
}
}
}
}
}
#[test]
fn span_healing_merges_adjacent_same_type() {
let text = "Bundeskanzler Olaf Scholz met the press.";
let mut entities = vec![
Entity::new("Bundes", EntityType::Person, 0, 6, 0.8),
Entity::new("kanzler", EntityType::Person, 6, 13, 0.8),
];
super::heal_adjacent_spans(text, &mut entities);
assert_eq!(entities.len(), 1, "should merge: {:?}", entities);
assert_eq!(entities[0].text, "Bundeskanzler");
assert_eq!(entities[0].start(), 0);
assert_eq!(entities[0].end(), 13);
}
#[test]
fn span_healing_does_not_merge_different_types() {
let text = "Alice visited Berlin yesterday.";
let mut entities = vec![
Entity::new("Alice", EntityType::Person, 0, 5, 0.8),
Entity::new("Berlin", EntityType::Location, 14, 20, 0.8),
];
super::heal_adjacent_spans(text, &mut entities);
assert_eq!(entities.len(), 2, "different types should not merge");
}
#[test]
fn span_healing_merges_with_single_char_gap() {
let text = "U.S. District Court ruled today.";
let mut entities = vec![
Entity::new("U.S.", EntityType::Organization, 0, 4, 0.8),
Entity::new("District", EntityType::Organization, 5, 13, 0.8),
];
super::heal_adjacent_spans(text, &mut entities);
assert_eq!(
entities.len(),
1,
"gap=1 with space should merge: {:?}",
entities
);
assert_eq!(entities[0].text, "U.S. District");
}
#[test]
fn filter_title_words_removes_bundeskanzler() {
let mut entities = vec![
Entity::new("Bundeskanzler", EntityType::Organization, 0, 13, 0.85),
Entity::new("Germany", EntityType::Location, 25, 32, 0.9),
Entity::new("Chancellor", EntityType::Organization, 40, 50, 0.8),
];
super::filter_title_words(&mut entities);
let names: Vec<&str> = entities.iter().map(|e| e.text.as_str()).collect();
assert!(
!names.contains(&"Bundeskanzler"),
"Bundeskanzler should be filtered, got: {:?}",
names
);
assert!(
!names.contains(&"Chancellor"),
"Chancellor should be filtered, got: {:?}",
names
);
assert!(
names.contains(&"Germany"),
"Germany should be kept, got: {:?}",
names
);
}
#[test]
fn filter_title_words_keeps_multi_word_orgs() {
let mut entities = vec![
Entity::new(
"Federal Chancellor Office",
EntityType::Organization,
0,
25,
0.9,
),
Entity::new("President Hotel", EntityType::Organization, 30, 45, 0.85),
];
super::filter_title_words(&mut entities);
assert_eq!(entities.len(), 2, "Multi-word ORGs should be kept");
}
#[test]
fn structured_type_subsumes_generic() {
let existing = Entity::new("EUR", EntityType::from_label("misc"), 0, 3, 0.8);
let candidate = Entity::new("EUR 2 billion", EntityType::Money, 0, 13, 0.95);
let strategy = ConflictStrategy::Priority;
let resolution = strategy.resolve(&existing, &candidate);
assert!(
matches!(resolution, Resolution::Replace),
"MONEY should subsume misc via structured-type rule"
);
}
#[test]
fn priority_keeps_existing_on_overlap() {
let existing = Entity::new("Jensen Huang", EntityType::Person, 0, 12, 0.99);
let candidate = Entity::new("Jensen Huang", EntityType::Organization, 0, 12, 0.95);
let strategy = ConflictStrategy::Priority;
let resolution = strategy.resolve(&existing, &candidate);
assert!(
matches!(resolution, Resolution::KeepExisting),
"Priority should keep first-layer entity"
);
}
#[test]
fn longest_span_prefers_longer() {
let existing = Entity::new("New York", EntityType::Location, 0, 8, 0.95);
let candidate = Entity::new("New York City", EntityType::Location, 0, 13, 0.90);
let strategy = ConflictStrategy::LongestSpan;
let resolution = strategy.resolve(&existing, &candidate);
assert!(
matches!(resolution, Resolution::Replace),
"LongestSpan should prefer longer entity"
);
}
#[test]
fn highest_conf_prefers_higher() {
let existing = Entity::new("Berlin", EntityType::Location, 0, 6, 0.7);
let candidate = Entity::new("Berlin", EntityType::Location, 0, 6, 0.95);
let strategy = ConflictStrategy::HighestConf;
let resolution = strategy.resolve(&existing, &candidate);
assert!(
matches!(resolution, Resolution::Replace),
"HighestConf should prefer higher confidence"
);
}