anno/backends/stacked/mod.rs
1//! Stacked NER.
2//!
3//! `StackedNER` composes multiple extractors (regex, heuristics, and optionally ML backends)
4//! and then resolves overlaps via a small conflict strategy (priority/longest/confidence/union).
5//!
6//! This module intentionally keeps the API surface small. For user-facing guidance and
7//! provenance details, see `docs/BACKENDS.md` and the repo README.
8
9use super::heuristic::HeuristicNER;
10use super::regex::RegexNER;
11use crate::{Entity, EntityType, Model, Result};
12use itertools::Itertools;
13use std::borrow::Cow;
14use std::sync::Arc;
15
16fn method_for_layer_name(layer_name: &str) -> anno_core::ExtractionMethod {
17 match layer_name {
18 // Our built-in IDs are lowercase and stable.
19 "regex" => anno_core::ExtractionMethod::Pattern,
20 "heuristic" => anno_core::ExtractionMethod::Heuristic,
21 // Legacy backend id (deprecated, but still used in tests/compositions).
22 "rule" => anno_core::ExtractionMethod::Heuristic,
23 // For everything else, this is the least-wrong default.
24 // (E.g. ONNX/Candle transformer backends, CRF, etc.)
25 _ => anno_core::ExtractionMethod::Neural,
26 }
27}
28
29// =============================================================================
30// Conflict Resolution
31// =============================================================================
32
/// How [`StackedNER`] decides between entities whose spans overlap.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ConflictStrategy {
    /// The entity that was accepted first is kept; later overlapping candidates are dropped.
    /// Simple and predictable. This is the default.
    #[default]
    Priority,

    /// The longer span is kept, e.g. "New York City" is preferred over "New York".
    LongestSpan,

    /// The entity with the higher confidence score is kept.
    HighestConf,

    /// Nothing is dropped: overlapping entities are all kept.
    /// Useful when downstream processing handles disambiguation.
    Union,
}
50
51impl ConflictStrategy {
52 /// Resolve a conflict between two overlapping entities.
53 ///
54 /// # Arguments
55 /// * `existing` - Entity already in the result set (from earlier layer)
56 /// * `candidate` - New entity from current layer
57 ///
58 /// # Design Note
59 ///
60 /// When confidence/length are equal, we prefer `existing` to respect
61 /// layer priority (earlier layers have higher priority).
62 fn resolve(&self, existing: &Entity, candidate: &Entity) -> Resolution {
63 match self {
64 ConflictStrategy::Priority => Resolution::KeepExisting,
65
66 ConflictStrategy::LongestSpan => {
67 let existing_len = existing.end - existing.start;
68 let candidate_len = candidate.end - candidate.start;
69 if candidate_len > existing_len {
70 Resolution::Replace
71 } else if candidate_len < existing_len {
72 Resolution::KeepExisting
73 } else {
74 // Equal length: prefer existing (earlier layer has priority)
75 Resolution::KeepExisting
76 }
77 }
78
79 ConflictStrategy::HighestConf => {
80 // Prefer higher confidence, but if equal, prefer existing (earlier layer)
81 if candidate.confidence > existing.confidence {
82 Resolution::Replace
83 } else if candidate.confidence < existing.confidence {
84 Resolution::KeepExisting
85 } else {
86 // Equal confidence: prefer existing (earlier layer has priority)
87 Resolution::KeepExisting
88 }
89 }
90
91 ConflictStrategy::Union => Resolution::KeepBoth,
92 }
93 }
94}
95
/// Outcome of [`ConflictStrategy::resolve`] for one existing/candidate pair.
#[derive(Debug)]
enum Resolution {
    /// Leave the existing entity in place and drop the candidate.
    KeepExisting,
    /// The candidate takes the existing entity's place.
    Replace,
    /// Keep the existing entity and add the candidate as well.
    KeepBoth,
}
102
103// =============================================================================
104// StackedNER
105// =============================================================================
106
/// Composable NER that combines multiple backends.
///
/// `StackedNER` accepts **any backend that implements `Model`**, not just regex and heuristics.
/// You can combine pattern-based, heuristic-based, and ML-based backends in any order.
///
/// # Design
///
/// Different backends excel at different tasks:
///
/// | Backend Type | Best For | Trade-off |
/// |--------------|----------|-----------|
/// | Pattern (`RegexNER`) | Structured entities (dates, money, emails) | Can't do named entities |
/// | Heuristic (`HeuristicNER`) | Named entities (no deps) | Lower accuracy than ML |
/// | ML (`GLiNER`, `NuNER`, `BertNEROnnx`, etc.) | Everything, high accuracy | Heavy dependencies, slower |
///
/// `StackedNER` runs backends in order, merging results according to the
/// configured [`ConflictStrategy`].
///
/// # Default Configuration
///
/// `StackedNER::default()` always ends with the Pattern + Heuristic layers:
/// - `RegexNER` (dates, money, emails, etc.)
/// - `HeuristicNER` (person, org, location)
///
/// When the crate is built with the `onnx` feature, `default()` first tries to load an ML
/// backend (BERT ONNX, then GLiNER) and, if one loads, places it ahead of those two layers.
/// Without that feature the default stack has zero ML dependencies.
///
/// # Examples
///
/// Default stack (Pattern + Heuristic when no ML backend is compiled in):
///
/// ```rust
/// use anno::{Model, StackedNER};
///
/// let ner = StackedNER::default();
/// let entities = ner.extract_entities("Dr. Smith charges $100/hr", None).unwrap();
/// ```
///
/// Custom stack with pattern + heuristic:
///
/// ```rust
/// use anno::{Model, RegexNER, HeuristicNER, StackedNER};
/// use anno::backends::stacked::ConflictStrategy;
///
/// let ner = StackedNER::builder()
///     .layer(RegexNER::new())
///     .layer(HeuristicNER::new())
///     .strategy(ConflictStrategy::LongestSpan)
///     .build();
/// ```
///
/// **Composing with ML backends** (requires `onnx` or `candle` feature):
///
/// ```rust,no_run
/// #[cfg(feature = "onnx")]
/// {
/// use anno::{Model, StackedNER, GLiNEROnnx, RegexNER, HeuristicNER};
/// use anno::backends::stacked::ConflictStrategy;
///
/// // ML-first: ML runs first, then patterns fill gaps
/// let ner = StackedNER::with_ml_first(
///     Box::new(GLiNEROnnx::new("onnx-community/gliner_small-v2.1").unwrap())
/// );
///
/// // ML-fallback: patterns/heuristics first, ML as fallback
/// let ner = StackedNER::with_ml_fallback(
///     Box::new(GLiNEROnnx::new("onnx-community/gliner_small-v2.1").unwrap())
/// );
///
/// // Custom stack: any combination of backends
/// let ner = StackedNER::builder()
///     .layer(RegexNER::new()) // High-precision structured entities
///     .layer_boxed(Box::new(GLiNEROnnx::new("onnx-community/gliner_small-v2.1").unwrap())) // ML layer
///     .layer(HeuristicNER::new()) // Quick named entities
///     .strategy(ConflictStrategy::HighestConf) // Resolve conflicts by confidence
///     .build();
/// }
/// ```
///
/// You can stack multiple ML backends, mix ONNX and Candle backends, or create any
/// combination that fits your use case. The builder accepts any `Model` implementation.
pub struct StackedNER {
    /// Backends in priority order (earlier = higher priority).
    layers: Vec<Arc<dyn Model + Send + Sync>>,
    /// How overlapping entities from different layers are reconciled.
    strategy: ConflictStrategy,
    /// Display name of the form `stacked(<layer>+<layer>+...)`, built in `try_build`.
    name: String,
    /// Leaked `'static` copy of `name`, created at most once on the first `name()` call
    /// (avoids Box::leak on every name() call).
    name_static: std::sync::OnceLock<&'static str>,
}
194
/// Builder for [`StackedNER`] with fluent configuration.
///
/// A default-constructed builder has no layers and the default [`ConflictStrategy`];
/// at least one layer must be added before building.
#[derive(Default)]
pub struct StackedNERBuilder {
    /// Layers in the order they were added (earlier = higher priority).
    layers: Vec<Box<dyn Model + Send + Sync>>,
    /// Strategy handed to the built [`StackedNER`].
    strategy: ConflictStrategy,
}
201
202impl StackedNERBuilder {
203 /// Add a layer (order matters: earlier = higher priority).
204 #[must_use]
205 pub fn layer<M: Model + Send + Sync + 'static>(mut self, model: M) -> Self {
206 self.layers.push(Box::new(model));
207 self
208 }
209
210 /// Add a boxed layer.
211 #[must_use]
212 pub fn layer_boxed(mut self, model: Box<dyn Model + Send + Sync>) -> Self {
213 self.layers.push(model);
214 self
215 }
216
217 /// Set the conflict resolution strategy.
218 #[must_use]
219 pub fn strategy(mut self, strategy: ConflictStrategy) -> Self {
220 self.strategy = strategy;
221 self
222 }
223
224 /// Build the configured StackedNER.
225 ///
226 /// # Panics
227 ///
228 /// Panics if no layers are provided (empty stack is invalid).
229 #[must_use]
230 pub fn build(self) -> StackedNER {
231 self.try_build().expect(
232 "StackedNER requires at least one layer. Use StackedNER::builder().layer(...).build()",
233 )
234 }
235
236 /// Build the configured StackedNER without panicking.
237 ///
238 /// This is useful when the stack is assembled dynamically (e.g., from CLI flags)
239 /// and an empty stack should be handled as an error instead of aborting.
240 pub fn try_build(self) -> crate::Result<StackedNER> {
241 if self.layers.is_empty() {
242 return Err(crate::Error::InvalidInput(
243 "StackedNER requires at least one layer".to_string(),
244 ));
245 }
246
247 let name = format!(
248 "stacked({})",
249 self.layers
250 .iter()
251 .map(|l| l.name())
252 .collect::<Vec<_>>()
253 .join("+")
254 );
255
256 Ok(StackedNER {
257 layers: self.layers.into_iter().map(Arc::from).collect(),
258 strategy: self.strategy,
259 name,
260 name_static: std::sync::OnceLock::new(),
261 })
262 }
263}
264
265impl StackedNER {
266 /// Create default configuration: Pattern + Statistical layers.
267 ///
268 /// This provides zero-dependency NER with:
269 /// - High-precision structured entity extraction (dates, money, etc.)
270 /// - Heuristic named entity extraction (person, org, location)
271 #[must_use]
272 pub fn new() -> Self {
273 Self::default()
274 }
275
276 /// Create a builder for custom configuration.
277 #[must_use]
278 pub fn builder() -> StackedNERBuilder {
279 StackedNERBuilder::default()
280 }
281
282 /// Create with explicit layers and default priority strategy.
283 #[must_use]
284 pub fn with_layers(layers: Vec<Box<dyn Model + Send + Sync>>) -> Self {
285 let mut builder = Self::builder().strategy(ConflictStrategy::Priority);
286 for layer in layers {
287 builder = builder.layer_boxed(layer);
288 }
289 builder.build()
290 }
291
292 /// Create with custom heuristic threshold.
293 ///
294 /// Higher threshold = fewer but higher confidence heuristic entities.
295 /// Note: HeuristicNER does not currently support dynamic thresholding
296 /// in constructor, so this method ignores the parameter for now but maintains API compat.
297 #[must_use]
298 pub fn with_heuristic_threshold(_threshold: f64) -> Self {
299 Self::builder()
300 .layer(RegexNER::new())
301 .layer(HeuristicNER::new())
302 .build()
303 }
304
305 /// Backwards compatibility alias.
306 #[deprecated(since = "0.3.0", note = "Use with_heuristic_threshold instead")]
307 #[must_use]
308 pub fn with_statistical_threshold(threshold: f64) -> Self {
309 Self::with_heuristic_threshold(threshold)
310 }
311
312 /// Pattern-only configuration (no heuristic layer).
313 ///
314 /// Extracts only structured entities: dates, times, money, percentages,
315 /// emails, URLs, phone numbers.
316 #[must_use]
317 pub fn pattern_only() -> Self {
318 Self::builder().layer(RegexNER::new()).build()
319 }
320
321 /// Heuristic-only configuration (no pattern layer).
322 ///
323 /// Extracts only named entities: person, organization, location.
324 #[must_use]
325 pub fn heuristic_only() -> Self {
326 Self::builder().layer(HeuristicNER::new()).build()
327 }
328
329 /// Backwards compatibility alias.
330 #[deprecated(since = "0.3.0", note = "Use heuristic_only instead")]
331 #[must_use]
332 pub fn statistical_only() -> Self {
333 Self::heuristic_only()
334 }
335
336 /// Add an ML backend as highest priority.
337 ///
338 /// ML runs first, then Pattern fills structured gaps, then Heuristic.
339 #[must_use]
340 pub fn with_ml_first(ml_backend: Box<dyn Model + Send + Sync>) -> Self {
341 Self::builder()
342 .layer_boxed(ml_backend)
343 .layer(RegexNER::new())
344 .layer(HeuristicNER::new())
345 .build()
346 }
347
348 /// Add an ML backend as fallback (lowest priority).
349 ///
350 /// Pattern runs first (high precision), then Heuristic, then ML.
351 #[must_use]
352 pub fn with_ml_fallback(ml_backend: Box<dyn Model + Send + Sync>) -> Self {
353 Self::builder()
354 .layer(RegexNER::new())
355 .layer(HeuristicNER::new())
356 .layer_boxed(ml_backend)
357 .build()
358 }
359
360 /// Get the number of layers.
361 #[must_use]
362 pub fn num_layers(&self) -> usize {
363 self.layers.len()
364 }
365
366 /// Get layer names in priority order.
367 #[must_use]
368 pub fn layer_names(&self) -> Vec<String> {
369 self.layers
370 .iter()
371 .map(|l| l.name().to_string())
372 .collect_vec()
373 }
374
375 /// Get the conflict strategy.
376 #[must_use]
377 pub fn strategy(&self) -> ConflictStrategy {
378 self.strategy
379 }
380
381 /// Get statistics about the stack configuration.
382 ///
383 /// Returns a summary of layer count, strategy, and layer names.
384 /// Useful for debugging and monitoring.
385 #[must_use]
386 pub fn stats(&self) -> StackStats {
387 StackStats {
388 layer_count: self.layers.len(),
389 strategy: self.strategy,
390 layer_names: self.layer_names(),
391 }
392 }
393}
394
/// Statistics about a StackedNER configuration.
///
/// Returned by `StackedNER::stats()`. It describes only how the stack is configured
/// (which layers, which strategy); it carries no extraction results or timing data.
#[derive(Debug, Clone)]
pub struct StackStats {
    /// Number of layers in the stack.
    pub layer_count: usize,
    /// Conflict resolution strategy.
    pub strategy: ConflictStrategy,
    /// Names of all layers in priority order (earliest = highest priority).
    pub layer_names: Vec<String>,
}
407
408impl Default for StackedNER {
409 /// Default configuration: Best available model stack.
410 ///
411 /// Tries to include ML backends (GLiNER, BERT) when available, falling back to
412 /// Pattern + Heuristic for zero-dependency operation.
413 ///
414 /// Downloads are allowed by default; opt out by setting `ANNO_NO_DOWNLOADS=1`
415 /// (or `HF_HUB_OFFLINE=1` to force HuggingFace offline mode).
416 ///
417 /// Priority:
418 /// 1. BERT ONNX (if `onnx` feature and model available) - strong default for standard NER
419 /// 2. GLiNER (if `onnx` feature and model available) - zero-shot, broader label set
420 /// 3. Pattern + Heuristic (always available) - zero dependencies
421 fn default() -> Self {
422 // Try BERT first for standard NER (usually best on PER/ORG/LOC/MISC).
423 #[cfg(feature = "onnx")]
424 {
425 fn no_downloads() -> bool {
426 match std::env::var("ANNO_NO_DOWNLOADS") {
427 Ok(v) => matches!(
428 v.trim().to_ascii_lowercase().as_str(),
429 "1" | "true" | "yes" | "y" | "on"
430 ),
431 Err(_) => false,
432 }
433 }
434
435 struct EnvVarGuard {
436 key: &'static str,
437 prev: Option<String>,
438 }
439
440 impl EnvVarGuard {
441 fn set(key: &'static str, value: &str) -> Self {
442 let prev = std::env::var(key).ok();
443 std::env::set_var(key, value);
444 Self { key, prev }
445 }
446 }
447
448 impl Drop for EnvVarGuard {
449 fn drop(&mut self) {
450 match &self.prev {
451 Some(v) => std::env::set_var(self.key, v),
452 None => std::env::remove_var(self.key),
453 }
454 }
455 }
456
457 // Opt-out policy: allow downloads unless explicitly disabled.
458 // GLiNER/BERT loaders use `hf_hub`, which honors `HF_HUB_OFFLINE=1`.
459 let _offline = no_downloads().then(|| EnvVarGuard::set("HF_HUB_OFFLINE", "1"));
460
461 use crate::backends::onnx::BertNEROnnx;
462 use crate::DEFAULT_BERT_ONNX_MODEL;
463 if let Ok(bert) = BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
464 return Self::builder()
465 .layer_boxed(Box::new(bert))
466 .layer(RegexNER::new())
467 .layer(HeuristicNER::new())
468 .build();
469 }
470
471 // Fallback to GLiNER (zero-shot, broader label set).
472 use crate::{GLiNEROnnx, DEFAULT_GLINER_MODEL};
473 if let Ok(gliner) = GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
474 return Self::builder()
475 .layer_boxed(Box::new(gliner))
476 .layer(RegexNER::new())
477 .layer(HeuristicNER::new())
478 .build();
479 }
480 }
481
482 // Ultimate fallback: Pattern + Heuristic (zero dependencies)
483 Self::builder()
484 .layer(RegexNER::new())
485 .layer(HeuristicNER::new())
486 .build()
487 }
488}
489
490impl Model for StackedNER {
491 #[cfg_attr(feature = "production", tracing::instrument(skip(self, text), fields(text_len = text.len(), num_layers = self.layers.len())))]
492 fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
493 // Performance: Pre-allocate entities vec with estimated capacity
494 // Most texts have 0-20 entities, but we'll start with a reasonable default
495 let mut entities: Vec<Entity> = Vec::with_capacity(16);
496 let mut layer_errors = Vec::new();
497
498 // Performance optimization: Cache text length (O(n) operation, called many times)
499 // This is shared across all backends and called in hot loops
500 // ROI: High - called once per extract_entities, saves O(n) per entity in loop
501 let text_char_count = text.chars().count();
502
503 for layer in &self.layers {
504 let layer_name = layer.name();
505
506 // Try to extract from this layer, but continue on error if other layers succeeded
507 let layer_entities = match layer.extract_entities(text, language) {
508 Ok(ents) => ents,
509 Err(e) => {
510 // Log error but continue with other layers (partial results)
511 layer_errors.push((layer_name.to_string(), format!("{}", e)));
512 if entities.is_empty() {
513 // If no entities found yet, fail fast
514 return Err(e);
515 }
516 // Otherwise, continue with partial results
517 continue;
518 }
519 };
520
521 for mut candidate in layer_entities {
522 // Defensive: Clamp entity offsets to valid range
523 // Some backends may produce out-of-bounds offsets in edge cases (Unicode, control chars)
524 // Use cached text_char_count instead of recalculating (performance optimization)
525 if candidate.end > text_char_count {
526 log::debug!(
527 "StackedNER: Clamping entity end offset from {} to {} (text length: {})",
528 candidate.end,
529 text_char_count,
530 text_char_count
531 );
532 candidate.end = text_char_count;
533 // Keep `entity.text` consistent with the adjusted span (Unicode-safe).
534 //
535 // This only triggers on buggy/out-of-bounds backends, but when it does,
536 // returning a span/text mismatch is more confusing than truncating text.
537 if candidate.start < candidate.end {
538 candidate.text = crate::offset::TextSpan::from_chars(
539 text,
540 candidate.start,
541 candidate.end,
542 )
543 .extract(text)
544 .to_string();
545 }
546 }
547 if candidate.start >= candidate.end || candidate.start > text_char_count {
548 // Invalid span - skip this entity
549 log::debug!(
550 "StackedNER: Skipping entity with invalid span: start={}, end={}, text_len={}",
551 candidate.start,
552 candidate.end,
553 text_char_count
554 );
555 continue;
556 }
557
558 // Add provenance tracking if not already set
559 if candidate.provenance.is_none() {
560 candidate.provenance = Some(anno_core::Provenance {
561 source: Cow::Borrowed(layer_name),
562 method: method_for_layer_name(layer_name),
563 pattern: None,
564 raw_confidence: Some(candidate.confidence),
565 model_version: None,
566 timestamp: None,
567 });
568 }
569
570 // Find ALL overlapping entities (not just first)
571 //
572 // Performance: O(n) per candidate, O(n²) overall for n entities.
573 // For large entity sets, consider optimizing with:
574 // - Interval tree: O(n log n) construction, O(log n + k) query (k = overlaps)
575 // - Sorted intervals with binary search: O(n log n) sort, O(log n + k) query
576 // Current implementation prioritizes correctness and simplicity.
577 //
578 // Note: Entities are sorted at the end, but during conflict resolution
579 // we process candidates in layer order, so we can't assume sorted order here.
580 let overlapping_indices: Vec<usize> = entities
581 .iter()
582 .enumerate()
583 .filter_map(|(idx, e)| {
584 // Check if candidate overlaps with existing entity
585 // Overlap: !(candidate.end <= e.start || candidate.start >= e.end)
586 if candidate.end > e.start && candidate.start < e.end {
587 Some(idx)
588 } else {
589 None
590 }
591 })
592 .collect();
593
594 match overlapping_indices.len() {
595 0 => {
596 // No overlap - add directly
597 entities.push(candidate);
598 }
599 1 => {
600 // Single overlap - resolve normally
601 let idx = overlapping_indices[0];
602 match self.strategy.resolve(&entities[idx], &candidate) {
603 Resolution::KeepExisting => {}
604 Resolution::Replace => {
605 entities[idx] = candidate;
606 }
607 Resolution::KeepBoth => {
608 entities.push(candidate);
609 }
610 }
611 }
612 _ => {
613 // Multiple overlaps - need to handle carefully
614 // Strategy: resolve with the "best" existing entity based on strategy,
615 // then check if candidate should replace it
616 let best_idx = overlapping_indices
617 .iter()
618 .max_by(|&&a, &&b| {
619 // Find the "best" existing entity to compare against
620 match self.strategy {
621 ConflictStrategy::Priority => {
622 // Earlier in list = higher priority
623 a.cmp(&b).reverse()
624 }
625 ConflictStrategy::LongestSpan => {
626 let len_a = entities[a].end - entities[a].start;
627 let len_b = entities[b].end - entities[b].start;
628 len_a.cmp(&len_b).then_with(|| b.cmp(&a))
629 }
630 ConflictStrategy::HighestConf => entities[a]
631 .confidence
632 .partial_cmp(&entities[b].confidence)
633 .unwrap_or(std::cmp::Ordering::Equal)
634 .then_with(|| b.cmp(&a)),
635 ConflictStrategy::Union => {
636 // For union, we'll keep all, so just pick first
637 a.cmp(&b)
638 }
639 }
640 })
641 .copied()
642 .unwrap_or(overlapping_indices[0]);
643
644 match self.strategy {
645 ConflictStrategy::Union => {
646 // Keep candidate and all existing overlapping entities
647 entities.push(candidate);
648 }
649 _ => {
650 // Resolve with best existing entity
651 match self.strategy.resolve(&entities[best_idx], &candidate) {
652 Resolution::KeepExisting => {
653 // Remove other overlapping entities (they're subsumed)
654 // Sort indices descending to remove from end
655 let mut to_remove: Vec<usize> = overlapping_indices
656 .into_iter()
657 .filter(|&idx| idx != best_idx)
658 .collect();
659 // Performance: Use unstable sort (we don't need stable sort here)
660 to_remove.sort_unstable_by(|a, b| b.cmp(a));
661 for idx in to_remove {
662 entities.remove(idx);
663 }
664 }
665 Resolution::Replace => {
666 // Replace best and remove others
667 let mut to_remove: Vec<usize> = overlapping_indices
668 .into_iter()
669 .filter(|&idx| idx != best_idx)
670 .collect();
671 // Performance: Use unstable sort (we don't need stable sort here)
672 to_remove.sort_unstable_by(|a, b| b.cmp(a));
673
674 // Adjust best_idx based on how many entities we remove before it
675 let removed_before_best =
676 to_remove.iter().filter(|&&idx| idx < best_idx).count();
677 let adjusted_best_idx = best_idx - removed_before_best;
678
679 // Remove entities (in descending order to preserve indices)
680 for idx in to_remove {
681 entities.remove(idx);
682 }
683
684 // Now use adjusted index
685 entities[adjusted_best_idx] = candidate;
686 }
687 Resolution::KeepBoth => {
688 // Remove others, keep best and candidate
689 let mut to_remove: Vec<usize> = overlapping_indices
690 .into_iter()
691 .filter(|&idx| idx != best_idx)
692 .collect();
693 // Performance: Use unstable sort (we don't need stable sort here)
694 to_remove.sort_unstable_by(|a, b| b.cmp(a));
695 // Remove entities (best_idx remains valid since we don't remove it)
696 for idx in to_remove {
697 entities.remove(idx);
698 }
699 entities.push(candidate);
700 }
701 }
702 }
703 }
704 }
705 }
706 }
707 }
708
709 // Sort by position (start, then end) with deterministic tie-breaks.
710 //
711 // We include additional keys so exact-tie cases (same span) produce stable ordering,
712 // and so dedup-by-span+type (below) works reliably if duplicates slip through.
713 entities.sort_unstable_by(|a, b| {
714 let a_ty = a.entity_type.as_label();
715 let b_ty = b.entity_type.as_label();
716 let a_src = a
717 .provenance
718 .as_ref()
719 .map(|p| p.source.as_ref())
720 .unwrap_or("");
721 let b_src = b
722 .provenance
723 .as_ref()
724 .map(|p| p.source.as_ref())
725 .unwrap_or("");
726
727 (a.start, a.end, a_ty, a_src, a.text.as_str()).cmp(&(
728 b.start,
729 b.end,
730 b_ty,
731 b_src,
732 b.text.as_str(),
733 ))
734 });
735
736 // Remove any duplicates that might have been created (defensive)
737 // Only deduplicate if not using Union strategy (Union intentionally allows overlaps)
738 if self.strategy != ConflictStrategy::Union {
739 // Two entities are duplicates if they have same span and type
740 // Performance: dedup_by is O(n) and efficient for sorted vec
741 entities.dedup_by(|a, b| {
742 a.start == b.start && a.end == b.end && a.entity_type == b.entity_type
743 });
744 }
745
746 // If we had errors but got partial results, log them but return success
747 if !layer_errors.is_empty() && !entities.is_empty() {
748 log::warn!(
749 "StackedNER: Some layers failed but returning partial results. Errors: {:?}",
750 layer_errors
751 );
752 }
753
754 // Validate final entities (defensive programming)
755 // This catches bugs in individual backends that might produce invalid spans
756 for entity in &entities {
757 if entity.start >= entity.end {
758 log::warn!(
759 "StackedNER: Invalid entity span detected: start={}, end={}, text={:?}, type={:?}",
760 entity.start,
761 entity.end,
762 entity.text,
763 entity.entity_type
764 );
765 }
766 }
767
768 Ok(entities)
769 }
770
771 fn supported_types(&self) -> Vec<EntityType> {
772 // Use itertools for efficient deduplication
773 self.layers
774 .iter()
775 .flat_map(|layer| layer.supported_types())
776 .sorted_by(|a, b| format!("{:?}", a).cmp(&format!("{:?}", b)))
777 .dedup()
778 .collect_vec()
779 }
780
781 fn is_available(&self) -> bool {
782 self.layers.iter().any(|l| l.is_available())
783 }
784
785 fn name(&self) -> &'static str {
786 // Use OnceLock to cache the static string, avoiding repeated memory leaks
787 self.name_static
788 .get_or_init(|| Box::leak(self.name.clone().into_boxed_str()))
789 }
790
791 fn description(&self) -> &'static str {
792 "Stacked NER (multi-backend composition)"
793 }
794
795 fn capabilities(&self) -> crate::ModelCapabilities {
796 crate::ModelCapabilities {
797 batch_capable: true,
798 optimal_batch_size: Some(32),
799 streaming_capable: true,
800 recommended_chunk_size: Some(8_000),
801 ..Default::default()
802 }
803 }
804}
805
806// =============================================================================
807// Type Aliases for Backwards Compatibility
808// =============================================================================
809
/// Deprecated name for [`StackedNER`], kept for backwards compatibility.
#[deprecated(since = "0.2.0", note = "Use StackedNER instead")]
pub type LayeredNER = StackedNER;

/// Deprecated name for [`StackedNER`], kept for backwards compatibility.
///
/// The alias is the same type as `StackedNER`; the deprecation note points to
/// `StackedNER::default()` as the replacement.
#[deprecated(since = "0.2.0", note = "Use StackedNER::default() instead")]
pub type TieredNER = StackedNER;

/// Deprecated name for [`StackedNER`], kept for backwards compatibility.
#[deprecated(since = "0.2.0", note = "Use StackedNER instead")]
pub type CompositeNER = StackedNER;
821
// Capability markers: StackedNER combines pattern and heuristic extraction.
// Both marker traits are implemented unconditionally, whatever layers a given stack holds.
// NOTE(review): a `pattern_only()` or `heuristic_only()` stack still carries both markers —
// confirm that this is intended.
impl crate::StructuredEntityCapable for StackedNER {}
impl crate::NamedEntityCapable for StackedNER {}
825
826// =============================================================================
827// BatchCapable and StreamingCapable Trait Implementations
828// =============================================================================
829
830impl crate::BatchCapable for StackedNER {
831 fn extract_entities_batch(
832 &self,
833 texts: &[&str],
834 language: Option<&str>,
835 ) -> Result<Vec<Vec<Entity>>> {
836 texts
837 .iter()
838 .map(|text| self.extract_entities(text, language))
839 .collect()
840 }
841
842 fn optimal_batch_size(&self) -> Option<usize> {
843 Some(32) // Combination of pattern + heuristic
844 }
845}
846
847impl crate::StreamingCapable for StackedNER {
848 fn recommended_chunk_size(&self) -> usize {
849 8_000 // Slightly smaller due to multi-layer processing
850 }
851}
852
853// =============================================================================
854// Tests
855// =============================================================================
856
857#[cfg(test)]
858mod tests;