anno/backends/stacked/
mod.rs

1//! Stacked NER.
2//!
3//! `StackedNER` composes multiple extractors (regex, heuristics, and optionally ML backends)
4//! and then resolves overlaps via a small conflict strategy (priority/longest/confidence/union).
5//!
6//! This module intentionally keeps the API surface small. For user-facing guidance and
7//! provenance details, see `docs/BACKENDS.md` and the repo README.
8
9use super::heuristic::HeuristicNER;
10use super::regex::RegexNER;
11use crate::{Entity, EntityType, Model, Result};
12use itertools::Itertools;
13use std::borrow::Cow;
14use std::sync::Arc;
15
16fn method_for_layer_name(layer_name: &str) -> anno_core::ExtractionMethod {
17    match layer_name {
18        // Our built-in IDs are lowercase and stable.
19        "regex" => anno_core::ExtractionMethod::Pattern,
20        "heuristic" => anno_core::ExtractionMethod::Heuristic,
21        // Legacy backend id (deprecated, but still used in tests/compositions).
22        "rule" => anno_core::ExtractionMethod::Heuristic,
23        // For everything else, this is the least-wrong default.
24        // (E.g. ONNX/Candle transformer backends, CRF, etc.)
25        _ => anno_core::ExtractionMethod::Neural,
26    }
27}
28
29// =============================================================================
30// Conflict Resolution
31// =============================================================================
32
/// Strategy for resolving overlapping entity spans.
///
/// Layers are processed in order, so "existing" entities always come from the same or
/// an earlier (higher-priority) layer than the candidate being considered.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ConflictStrategy {
    /// First layer to claim a span wins. Simple and predictable.
    ///
    /// A later candidate never replaces an entity that is already accepted.
    #[default]
    Priority,

    /// Longest span wins. Prefers "New York City" over "New York".
    ///
    /// On equal length the already-accepted entity is kept.
    LongestSpan,

    /// Highest confidence score wins.
    ///
    /// On equal confidence the already-accepted entity is kept.
    HighestConf,

    /// Keep all entities, even if they overlap.
    /// Useful when downstream processing handles disambiguation.
    Union,
}
50
51impl ConflictStrategy {
52    /// Resolve a conflict between two overlapping entities.
53    ///
54    /// # Arguments
55    /// * `existing` - Entity already in the result set (from earlier layer)
56    /// * `candidate` - New entity from current layer
57    ///
58    /// # Design Note
59    ///
60    /// When confidence/length are equal, we prefer `existing` to respect
61    /// layer priority (earlier layers have higher priority).
62    fn resolve(&self, existing: &Entity, candidate: &Entity) -> Resolution {
63        match self {
64            ConflictStrategy::Priority => Resolution::KeepExisting,
65
66            ConflictStrategy::LongestSpan => {
67                let existing_len = existing.end - existing.start;
68                let candidate_len = candidate.end - candidate.start;
69                if candidate_len > existing_len {
70                    Resolution::Replace
71                } else if candidate_len < existing_len {
72                    Resolution::KeepExisting
73                } else {
74                    // Equal length: prefer existing (earlier layer has priority)
75                    Resolution::KeepExisting
76                }
77            }
78
79            ConflictStrategy::HighestConf => {
80                // Prefer higher confidence, but if equal, prefer existing (earlier layer)
81                if candidate.confidence > existing.confidence {
82                    Resolution::Replace
83                } else if candidate.confidence < existing.confidence {
84                    Resolution::KeepExisting
85                } else {
86                    // Equal confidence: prefer existing (earlier layer has priority)
87                    Resolution::KeepExisting
88                }
89            }
90
91            ConflictStrategy::Union => Resolution::KeepBoth,
92        }
93    }
94}
95
/// Outcome of comparing an already-accepted entity with an overlapping candidate.
///
/// Produced by `ConflictStrategy::resolve` and acted on by the merge loop.
#[derive(Debug)]
enum Resolution {
    /// The accepted entity stays; the candidate is not added.
    KeepExisting,
    /// The candidate takes the place of the accepted entity.
    Replace,
    /// Both are kept; the candidate is added alongside the accepted entity.
    KeepBoth,
}
102
103// =============================================================================
104// StackedNER
105// =============================================================================
106
/// Composable NER that combines multiple backends.
///
/// `StackedNER` accepts **any backend that implements `Model`**, not just regex and heuristics.
/// You can combine pattern-based, heuristic-based, and ML-based backends in any order.
///
/// # Design
///
/// Different backends excel at different tasks:
///
/// | Backend Type | Best For | Trade-off |
/// |--------------|----------|-----------|
/// | Pattern (`RegexNER`) | Structured entities (dates, money, emails) | Can't do named entities |
/// | Heuristic (`HeuristicNER`) | Named entities (no deps) | Lower accuracy than ML |
/// | ML (`GLiNER`, `NuNER`, `BertNEROnnx`, etc.) | Everything, high accuracy | Heavy dependencies, slower |
///
/// `StackedNER` runs backends in order, merging results according to the
/// configured [`ConflictStrategy`].
///
/// # Default Configuration
///
/// Without the `onnx` feature, `StackedNER::default()` creates a Pattern + Heuristic
/// configuration:
/// - Layer 1: `RegexNER` (dates, money, emails, etc.)
/// - Layer 2: `HeuristicNER` (person, org, location)
///
/// This provides solid NER coverage with zero ML dependencies.
///
/// With the `onnx` feature enabled, `default()` first tries to load an ML backend
/// (BERT ONNX, then GLiNER) and, if one loads, places it ahead of those two layers.
/// If neither loads, the Pattern + Heuristic stack above is used.
///
/// # Examples
///
/// Default stack (Pattern + Heuristic when no ML backend is compiled in or loadable):
///
/// ```rust
/// use anno::{Model, StackedNER};
///
/// let ner = StackedNER::default();
/// let entities = ner.extract_entities("Dr. Smith charges $100/hr", None).unwrap();
/// ```
///
/// Custom stack with pattern + heuristic:
///
/// ```rust
/// use anno::{Model, RegexNER, HeuristicNER, StackedNER};
/// use anno::backends::stacked::ConflictStrategy;
///
/// let ner = StackedNER::builder()
///     .layer(RegexNER::new())
///     .layer(HeuristicNER::new())
///     .strategy(ConflictStrategy::LongestSpan)
///     .build();
/// ```
///
/// **Composing with ML backends** (requires `onnx` or `candle` feature):
///
/// ```rust,no_run
/// #[cfg(feature = "onnx")]
/// {
/// use anno::{Model, StackedNER, GLiNEROnnx, RegexNER, HeuristicNER};
/// use anno::backends::stacked::ConflictStrategy;
///
/// // ML-first: ML runs first, then patterns fill gaps
/// let ner = StackedNER::with_ml_first(
///     Box::new(GLiNEROnnx::new("onnx-community/gliner_small-v2.1").unwrap())
/// );
///
/// // ML-fallback: patterns/heuristics first, ML as fallback
/// let ner = StackedNER::with_ml_fallback(
///     Box::new(GLiNEROnnx::new("onnx-community/gliner_small-v2.1").unwrap())
/// );
///
/// // Custom stack: any combination of backends
/// let ner = StackedNER::builder()
///     .layer(RegexNER::new())           // High-precision structured entities
///     .layer_boxed(Box::new(GLiNEROnnx::new("onnx-community/gliner_small-v2.1").unwrap()))  // ML layer
///     .layer(HeuristicNER::new())       // Quick named entities
///     .strategy(ConflictStrategy::HighestConf)  // Resolve conflicts by confidence
///     .build();
/// }
/// ```
///
/// You can stack multiple ML backends, mix ONNX and Candle backends, or create any
/// combination that fits your use case. The builder accepts any `Model` implementation.
pub struct StackedNER {
    /// Backends in priority order (index 0 runs first and has the highest priority).
    layers: Vec<Arc<dyn Model + Send + Sync>>,
    /// How overlapping spans from different layers are reconciled.
    strategy: ConflictStrategy,
    /// Composite name of the form `stacked(a+b+...)`, built from the layer names.
    name: String,
    /// Cached static name (avoids Box::leak on every name() call)
    name_static: std::sync::OnceLock<&'static str>,
}
194
/// Builder for [`StackedNER`] with fluent configuration.
///
/// Starts with no layers and the default [`ConflictStrategy`]; at least one layer
/// must be added before building.
#[derive(Default)]
pub struct StackedNERBuilder {
    /// Layers in the order they were added (earlier = higher priority).
    layers: Vec<Box<dyn Model + Send + Sync>>,
    /// Conflict strategy the built stack will use.
    strategy: ConflictStrategy,
}
201
202impl StackedNERBuilder {
203    /// Add a layer (order matters: earlier = higher priority).
204    #[must_use]
205    pub fn layer<M: Model + Send + Sync + 'static>(mut self, model: M) -> Self {
206        self.layers.push(Box::new(model));
207        self
208    }
209
210    /// Add a boxed layer.
211    #[must_use]
212    pub fn layer_boxed(mut self, model: Box<dyn Model + Send + Sync>) -> Self {
213        self.layers.push(model);
214        self
215    }
216
217    /// Set the conflict resolution strategy.
218    #[must_use]
219    pub fn strategy(mut self, strategy: ConflictStrategy) -> Self {
220        self.strategy = strategy;
221        self
222    }
223
224    /// Build the configured StackedNER.
225    ///
226    /// # Panics
227    ///
228    /// Panics if no layers are provided (empty stack is invalid).
229    #[must_use]
230    pub fn build(self) -> StackedNER {
231        self.try_build().expect(
232            "StackedNER requires at least one layer. Use StackedNER::builder().layer(...).build()",
233        )
234    }
235
236    /// Build the configured StackedNER without panicking.
237    ///
238    /// This is useful when the stack is assembled dynamically (e.g., from CLI flags)
239    /// and an empty stack should be handled as an error instead of aborting.
240    pub fn try_build(self) -> crate::Result<StackedNER> {
241        if self.layers.is_empty() {
242            return Err(crate::Error::InvalidInput(
243                "StackedNER requires at least one layer".to_string(),
244            ));
245        }
246
247        let name = format!(
248            "stacked({})",
249            self.layers
250                .iter()
251                .map(|l| l.name())
252                .collect::<Vec<_>>()
253                .join("+")
254        );
255
256        Ok(StackedNER {
257            layers: self.layers.into_iter().map(Arc::from).collect(),
258            strategy: self.strategy,
259            name,
260            name_static: std::sync::OnceLock::new(),
261        })
262    }
263}
264
265impl StackedNER {
266    /// Create default configuration: Pattern + Statistical layers.
267    ///
268    /// This provides zero-dependency NER with:
269    /// - High-precision structured entity extraction (dates, money, etc.)
270    /// - Heuristic named entity extraction (person, org, location)
271    #[must_use]
272    pub fn new() -> Self {
273        Self::default()
274    }
275
276    /// Create a builder for custom configuration.
277    #[must_use]
278    pub fn builder() -> StackedNERBuilder {
279        StackedNERBuilder::default()
280    }
281
282    /// Create with explicit layers and default priority strategy.
283    #[must_use]
284    pub fn with_layers(layers: Vec<Box<dyn Model + Send + Sync>>) -> Self {
285        let mut builder = Self::builder().strategy(ConflictStrategy::Priority);
286        for layer in layers {
287            builder = builder.layer_boxed(layer);
288        }
289        builder.build()
290    }
291
292    /// Create with custom heuristic threshold.
293    ///
294    /// Higher threshold = fewer but higher confidence heuristic entities.
295    /// Note: HeuristicNER does not currently support dynamic thresholding
296    /// in constructor, so this method ignores the parameter for now but maintains API compat.
297    #[must_use]
298    pub fn with_heuristic_threshold(_threshold: f64) -> Self {
299        Self::builder()
300            .layer(RegexNER::new())
301            .layer(HeuristicNER::new())
302            .build()
303    }
304
305    /// Backwards compatibility alias.
306    #[deprecated(since = "0.3.0", note = "Use with_heuristic_threshold instead")]
307    #[must_use]
308    pub fn with_statistical_threshold(threshold: f64) -> Self {
309        Self::with_heuristic_threshold(threshold)
310    }
311
312    /// Pattern-only configuration (no heuristic layer).
313    ///
314    /// Extracts only structured entities: dates, times, money, percentages,
315    /// emails, URLs, phone numbers.
316    #[must_use]
317    pub fn pattern_only() -> Self {
318        Self::builder().layer(RegexNER::new()).build()
319    }
320
321    /// Heuristic-only configuration (no pattern layer).
322    ///
323    /// Extracts only named entities: person, organization, location.
324    #[must_use]
325    pub fn heuristic_only() -> Self {
326        Self::builder().layer(HeuristicNER::new()).build()
327    }
328
329    /// Backwards compatibility alias.
330    #[deprecated(since = "0.3.0", note = "Use heuristic_only instead")]
331    #[must_use]
332    pub fn statistical_only() -> Self {
333        Self::heuristic_only()
334    }
335
336    /// Add an ML backend as highest priority.
337    ///
338    /// ML runs first, then Pattern fills structured gaps, then Heuristic.
339    #[must_use]
340    pub fn with_ml_first(ml_backend: Box<dyn Model + Send + Sync>) -> Self {
341        Self::builder()
342            .layer_boxed(ml_backend)
343            .layer(RegexNER::new())
344            .layer(HeuristicNER::new())
345            .build()
346    }
347
348    /// Add an ML backend as fallback (lowest priority).
349    ///
350    /// Pattern runs first (high precision), then Heuristic, then ML.
351    #[must_use]
352    pub fn with_ml_fallback(ml_backend: Box<dyn Model + Send + Sync>) -> Self {
353        Self::builder()
354            .layer(RegexNER::new())
355            .layer(HeuristicNER::new())
356            .layer_boxed(ml_backend)
357            .build()
358    }
359
360    /// Get the number of layers.
361    #[must_use]
362    pub fn num_layers(&self) -> usize {
363        self.layers.len()
364    }
365
366    /// Get layer names in priority order.
367    #[must_use]
368    pub fn layer_names(&self) -> Vec<String> {
369        self.layers
370            .iter()
371            .map(|l| l.name().to_string())
372            .collect_vec()
373    }
374
375    /// Get the conflict strategy.
376    #[must_use]
377    pub fn strategy(&self) -> ConflictStrategy {
378        self.strategy
379    }
380
381    /// Get statistics about the stack configuration.
382    ///
383    /// Returns a summary of layer count, strategy, and layer names.
384    /// Useful for debugging and monitoring.
385    #[must_use]
386    pub fn stats(&self) -> StackStats {
387        StackStats {
388            layer_count: self.layers.len(),
389            strategy: self.strategy,
390            layer_names: self.layer_names(),
391        }
392    }
393}
394
/// Statistics about a StackedNER configuration.
///
/// Provides insight into the stack's structure for debugging and monitoring.
/// Returned by `StackedNER::stats`; it is an owned snapshot and does not borrow
/// from the stack.
#[derive(Debug, Clone)]
pub struct StackStats {
    /// Number of layers in the stack.
    pub layer_count: usize,
    /// Conflict resolution strategy.
    pub strategy: ConflictStrategy,
    /// Names of all layers in priority order (earliest = highest priority).
    pub layer_names: Vec<String>,
}
407
408impl Default for StackedNER {
409    /// Default configuration: Best available model stack.
410    ///
411    /// Tries to include ML backends (GLiNER, BERT) when available, falling back to
412    /// Pattern + Heuristic for zero-dependency operation.
413    ///
414    /// Downloads are allowed by default; opt out by setting `ANNO_NO_DOWNLOADS=1`
415    /// (or `HF_HUB_OFFLINE=1` to force HuggingFace offline mode).
416    ///
417    /// Priority:
418    /// 1. BERT ONNX (if `onnx` feature and model available) - strong default for standard NER
419    /// 2. GLiNER (if `onnx` feature and model available) - zero-shot, broader label set
420    /// 3. Pattern + Heuristic (always available) - zero dependencies
421    fn default() -> Self {
422        // Try BERT first for standard NER (usually best on PER/ORG/LOC/MISC).
423        #[cfg(feature = "onnx")]
424        {
425            fn no_downloads() -> bool {
426                match std::env::var("ANNO_NO_DOWNLOADS") {
427                    Ok(v) => matches!(
428                        v.trim().to_ascii_lowercase().as_str(),
429                        "1" | "true" | "yes" | "y" | "on"
430                    ),
431                    Err(_) => false,
432                }
433            }
434
435            struct EnvVarGuard {
436                key: &'static str,
437                prev: Option<String>,
438            }
439
440            impl EnvVarGuard {
441                fn set(key: &'static str, value: &str) -> Self {
442                    let prev = std::env::var(key).ok();
443                    std::env::set_var(key, value);
444                    Self { key, prev }
445                }
446            }
447
448            impl Drop for EnvVarGuard {
449                fn drop(&mut self) {
450                    match &self.prev {
451                        Some(v) => std::env::set_var(self.key, v),
452                        None => std::env::remove_var(self.key),
453                    }
454                }
455            }
456
457            // Opt-out policy: allow downloads unless explicitly disabled.
458            // GLiNER/BERT loaders use `hf_hub`, which honors `HF_HUB_OFFLINE=1`.
459            let _offline = no_downloads().then(|| EnvVarGuard::set("HF_HUB_OFFLINE", "1"));
460
461            use crate::backends::onnx::BertNEROnnx;
462            use crate::DEFAULT_BERT_ONNX_MODEL;
463            if let Ok(bert) = BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
464                return Self::builder()
465                    .layer_boxed(Box::new(bert))
466                    .layer(RegexNER::new())
467                    .layer(HeuristicNER::new())
468                    .build();
469            }
470
471            // Fallback to GLiNER (zero-shot, broader label set).
472            use crate::{GLiNEROnnx, DEFAULT_GLINER_MODEL};
473            if let Ok(gliner) = GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
474                return Self::builder()
475                    .layer_boxed(Box::new(gliner))
476                    .layer(RegexNER::new())
477                    .layer(HeuristicNER::new())
478                    .build();
479            }
480        }
481
482        // Ultimate fallback: Pattern + Heuristic (zero dependencies)
483        Self::builder()
484            .layer(RegexNER::new())
485            .layer(HeuristicNER::new())
486            .build()
487    }
488}
489
490impl Model for StackedNER {
491    #[cfg_attr(feature = "production", tracing::instrument(skip(self, text), fields(text_len = text.len(), num_layers = self.layers.len())))]
492    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
493        // Performance: Pre-allocate entities vec with estimated capacity
494        // Most texts have 0-20 entities, but we'll start with a reasonable default
495        let mut entities: Vec<Entity> = Vec::with_capacity(16);
496        let mut layer_errors = Vec::new();
497
498        // Performance optimization: Cache text length (O(n) operation, called many times)
499        // This is shared across all backends and called in hot loops
500        // ROI: High - called once per extract_entities, saves O(n) per entity in loop
501        let text_char_count = text.chars().count();
502
503        for layer in &self.layers {
504            let layer_name = layer.name();
505
506            // Try to extract from this layer, but continue on error if other layers succeeded
507            let layer_entities = match layer.extract_entities(text, language) {
508                Ok(ents) => ents,
509                Err(e) => {
510                    // Log error but continue with other layers (partial results)
511                    layer_errors.push((layer_name.to_string(), format!("{}", e)));
512                    if entities.is_empty() {
513                        // If no entities found yet, fail fast
514                        return Err(e);
515                    }
516                    // Otherwise, continue with partial results
517                    continue;
518                }
519            };
520
521            for mut candidate in layer_entities {
522                // Defensive: Clamp entity offsets to valid range
523                // Some backends may produce out-of-bounds offsets in edge cases (Unicode, control chars)
524                // Use cached text_char_count instead of recalculating (performance optimization)
525                if candidate.end > text_char_count {
526                    log::debug!(
527                        "StackedNER: Clamping entity end offset from {} to {} (text length: {})",
528                        candidate.end,
529                        text_char_count,
530                        text_char_count
531                    );
532                    candidate.end = text_char_count;
533                    // Keep `entity.text` consistent with the adjusted span (Unicode-safe).
534                    //
535                    // This only triggers on buggy/out-of-bounds backends, but when it does,
536                    // returning a span/text mismatch is more confusing than truncating text.
537                    if candidate.start < candidate.end {
538                        candidate.text = crate::offset::TextSpan::from_chars(
539                            text,
540                            candidate.start,
541                            candidate.end,
542                        )
543                        .extract(text)
544                        .to_string();
545                    }
546                }
547                if candidate.start >= candidate.end || candidate.start > text_char_count {
548                    // Invalid span - skip this entity
549                    log::debug!(
550                        "StackedNER: Skipping entity with invalid span: start={}, end={}, text_len={}",
551                        candidate.start,
552                        candidate.end,
553                        text_char_count
554                    );
555                    continue;
556                }
557
558                // Add provenance tracking if not already set
559                if candidate.provenance.is_none() {
560                    candidate.provenance = Some(anno_core::Provenance {
561                        source: Cow::Borrowed(layer_name),
562                        method: method_for_layer_name(layer_name),
563                        pattern: None,
564                        raw_confidence: Some(candidate.confidence),
565                        model_version: None,
566                        timestamp: None,
567                    });
568                }
569
570                // Find ALL overlapping entities (not just first)
571                //
572                // Performance: O(n) per candidate, O(n²) overall for n entities.
573                // For large entity sets, consider optimizing with:
574                // - Interval tree: O(n log n) construction, O(log n + k) query (k = overlaps)
575                // - Sorted intervals with binary search: O(n log n) sort, O(log n + k) query
576                // Current implementation prioritizes correctness and simplicity.
577                //
578                // Note: Entities are sorted at the end, but during conflict resolution
579                // we process candidates in layer order, so we can't assume sorted order here.
580                let overlapping_indices: Vec<usize> = entities
581                    .iter()
582                    .enumerate()
583                    .filter_map(|(idx, e)| {
584                        // Check if candidate overlaps with existing entity
585                        // Overlap: !(candidate.end <= e.start || candidate.start >= e.end)
586                        if candidate.end > e.start && candidate.start < e.end {
587                            Some(idx)
588                        } else {
589                            None
590                        }
591                    })
592                    .collect();
593
594                match overlapping_indices.len() {
595                    0 => {
596                        // No overlap - add directly
597                        entities.push(candidate);
598                    }
599                    1 => {
600                        // Single overlap - resolve normally
601                        let idx = overlapping_indices[0];
602                        match self.strategy.resolve(&entities[idx], &candidate) {
603                            Resolution::KeepExisting => {}
604                            Resolution::Replace => {
605                                entities[idx] = candidate;
606                            }
607                            Resolution::KeepBoth => {
608                                entities.push(candidate);
609                            }
610                        }
611                    }
612                    _ => {
613                        // Multiple overlaps - need to handle carefully
614                        // Strategy: resolve with the "best" existing entity based on strategy,
615                        // then check if candidate should replace it
616                        let best_idx = overlapping_indices
617                            .iter()
618                            .max_by(|&&a, &&b| {
619                                // Find the "best" existing entity to compare against
620                                match self.strategy {
621                                    ConflictStrategy::Priority => {
622                                        // Earlier in list = higher priority
623                                        a.cmp(&b).reverse()
624                                    }
625                                    ConflictStrategy::LongestSpan => {
626                                        let len_a = entities[a].end - entities[a].start;
627                                        let len_b = entities[b].end - entities[b].start;
628                                        len_a.cmp(&len_b).then_with(|| b.cmp(&a))
629                                    }
630                                    ConflictStrategy::HighestConf => entities[a]
631                                        .confidence
632                                        .partial_cmp(&entities[b].confidence)
633                                        .unwrap_or(std::cmp::Ordering::Equal)
634                                        .then_with(|| b.cmp(&a)),
635                                    ConflictStrategy::Union => {
636                                        // For union, we'll keep all, so just pick first
637                                        a.cmp(&b)
638                                    }
639                                }
640                            })
641                            .copied()
642                            .unwrap_or(overlapping_indices[0]);
643
644                        match self.strategy {
645                            ConflictStrategy::Union => {
646                                // Keep candidate and all existing overlapping entities
647                                entities.push(candidate);
648                            }
649                            _ => {
650                                // Resolve with best existing entity
651                                match self.strategy.resolve(&entities[best_idx], &candidate) {
652                                    Resolution::KeepExisting => {
653                                        // Remove other overlapping entities (they're subsumed)
654                                        // Sort indices descending to remove from end
655                                        let mut to_remove: Vec<usize> = overlapping_indices
656                                            .into_iter()
657                                            .filter(|&idx| idx != best_idx)
658                                            .collect();
659                                        // Performance: Use unstable sort (we don't need stable sort here)
660                                        to_remove.sort_unstable_by(|a, b| b.cmp(a));
661                                        for idx in to_remove {
662                                            entities.remove(idx);
663                                        }
664                                    }
665                                    Resolution::Replace => {
666                                        // Replace best and remove others
667                                        let mut to_remove: Vec<usize> = overlapping_indices
668                                            .into_iter()
669                                            .filter(|&idx| idx != best_idx)
670                                            .collect();
671                                        // Performance: Use unstable sort (we don't need stable sort here)
672                                        to_remove.sort_unstable_by(|a, b| b.cmp(a));
673
674                                        // Adjust best_idx based on how many entities we remove before it
675                                        let removed_before_best =
676                                            to_remove.iter().filter(|&&idx| idx < best_idx).count();
677                                        let adjusted_best_idx = best_idx - removed_before_best;
678
679                                        // Remove entities (in descending order to preserve indices)
680                                        for idx in to_remove {
681                                            entities.remove(idx);
682                                        }
683
684                                        // Now use adjusted index
685                                        entities[adjusted_best_idx] = candidate;
686                                    }
687                                    Resolution::KeepBoth => {
688                                        // Remove others, keep best and candidate
689                                        let mut to_remove: Vec<usize> = overlapping_indices
690                                            .into_iter()
691                                            .filter(|&idx| idx != best_idx)
692                                            .collect();
693                                        // Performance: Use unstable sort (we don't need stable sort here)
694                                        to_remove.sort_unstable_by(|a, b| b.cmp(a));
695                                        // Remove entities (best_idx remains valid since we don't remove it)
696                                        for idx in to_remove {
697                                            entities.remove(idx);
698                                        }
699                                        entities.push(candidate);
700                                    }
701                                }
702                            }
703                        }
704                    }
705                }
706            }
707        }
708
709        // Sort by position (start, then end) with deterministic tie-breaks.
710        //
711        // We include additional keys so exact-tie cases (same span) produce stable ordering,
712        // and so dedup-by-span+type (below) works reliably if duplicates slip through.
713        entities.sort_unstable_by(|a, b| {
714            let a_ty = a.entity_type.as_label();
715            let b_ty = b.entity_type.as_label();
716            let a_src = a
717                .provenance
718                .as_ref()
719                .map(|p| p.source.as_ref())
720                .unwrap_or("");
721            let b_src = b
722                .provenance
723                .as_ref()
724                .map(|p| p.source.as_ref())
725                .unwrap_or("");
726
727            (a.start, a.end, a_ty, a_src, a.text.as_str()).cmp(&(
728                b.start,
729                b.end,
730                b_ty,
731                b_src,
732                b.text.as_str(),
733            ))
734        });
735
736        // Remove any duplicates that might have been created (defensive)
737        // Only deduplicate if not using Union strategy (Union intentionally allows overlaps)
738        if self.strategy != ConflictStrategy::Union {
739            // Two entities are duplicates if they have same span and type
740            // Performance: dedup_by is O(n) and efficient for sorted vec
741            entities.dedup_by(|a, b| {
742                a.start == b.start && a.end == b.end && a.entity_type == b.entity_type
743            });
744        }
745
746        // If we had errors but got partial results, log them but return success
747        if !layer_errors.is_empty() && !entities.is_empty() {
748            log::warn!(
749                "StackedNER: Some layers failed but returning partial results. Errors: {:?}",
750                layer_errors
751            );
752        }
753
754        // Validate final entities (defensive programming)
755        // This catches bugs in individual backends that might produce invalid spans
756        for entity in &entities {
757            if entity.start >= entity.end {
758                log::warn!(
759                    "StackedNER: Invalid entity span detected: start={}, end={}, text={:?}, type={:?}",
760                    entity.start,
761                    entity.end,
762                    entity.text,
763                    entity.entity_type
764                );
765            }
766        }
767
768        Ok(entities)
769    }
770
771    fn supported_types(&self) -> Vec<EntityType> {
772        // Use itertools for efficient deduplication
773        self.layers
774            .iter()
775            .flat_map(|layer| layer.supported_types())
776            .sorted_by(|a, b| format!("{:?}", a).cmp(&format!("{:?}", b)))
777            .dedup()
778            .collect_vec()
779    }
780
781    fn is_available(&self) -> bool {
782        self.layers.iter().any(|l| l.is_available())
783    }
784
785    fn name(&self) -> &'static str {
786        // Use OnceLock to cache the static string, avoiding repeated memory leaks
787        self.name_static
788            .get_or_init(|| Box::leak(self.name.clone().into_boxed_str()))
789    }
790
791    fn description(&self) -> &'static str {
792        "Stacked NER (multi-backend composition)"
793    }
794
795    fn capabilities(&self) -> crate::ModelCapabilities {
796        crate::ModelCapabilities {
797            batch_capable: true,
798            optimal_batch_size: Some(32),
799            streaming_capable: true,
800            recommended_chunk_size: Some(8_000),
801            ..Default::default()
802        }
803    }
804}
805
806// =============================================================================
807// Type Aliases for Backwards Compatibility
808// =============================================================================
809
/// Deprecated alias of [`StackedNER`], kept for backwards compatibility.
///
/// It names exactly the same type; use `StackedNER` directly in new code.
#[deprecated(since = "0.2.0", note = "Use StackedNER instead")]
pub type LayeredNER = StackedNER;
813
/// Deprecated alias of [`StackedNER`], kept for backwards compatibility.
///
/// It names exactly the same type; the deprecation note points callers at
/// `StackedNER::default()` as the replacement.
#[deprecated(since = "0.2.0", note = "Use StackedNER::default() instead")]
pub type TieredNER = StackedNER;
817
/// Deprecated alias of [`StackedNER`], kept for backwards compatibility.
///
/// It names exactly the same type; use `StackedNER` directly in new code.
#[deprecated(since = "0.2.0", note = "Use StackedNER instead")]
pub type CompositeNER = StackedNER;
821
// Capability markers: StackedNER combines pattern and heuristic extraction.
// Both impl bodies are empty; they only declare that StackedNER satisfies these
// traits (any behavior would come from defaults the traits themselves provide).
impl crate::StructuredEntityCapable for StackedNER {}
impl crate::NamedEntityCapable for StackedNER {}
825
826// =============================================================================
827// BatchCapable and StreamingCapable Trait Implementations
828// =============================================================================
829
830impl crate::BatchCapable for StackedNER {
831    fn extract_entities_batch(
832        &self,
833        texts: &[&str],
834        language: Option<&str>,
835    ) -> Result<Vec<Vec<Entity>>> {
836        texts
837            .iter()
838            .map(|text| self.extract_entities(text, language))
839            .collect()
840    }
841
842    fn optimal_batch_size(&self) -> Option<usize> {
843        Some(32) // Combination of pattern + heuristic
844    }
845}
846
847impl crate::StreamingCapable for StackedNER {
848    fn recommended_chunk_size(&self) -> usize {
849        8_000 // Slightly smaller due to multi-layer processing
850    }
851}
852
853// =============================================================================
854// Tests
855// =============================================================================
856
857#[cfg(test)]
858mod tests;
anno/backends/stacked/mod.rs

anno/backends/stacked/
mod.rs