// anno — crate root (src/lib.rs)
1//! # anno
2//!
3//! Information extraction: named entity recognition (NER) and within-document coreference.
4//!
5//! - **NER output**: variable-length spans with **character offsets** (Unicode scalar values), not
6//!   byte offsets.
7//! - **Coreference output**: clusters (“tracks”) of mentions within one document.
8//!
9//! This crate focuses on inference-time extraction. Dataset loaders, benchmarking, and matrix
10//! evaluation tooling live in `anno-eval` (and the `anno` CLI lives in `anno-cli`).
11//!
12//! ## Quickstart
13//!
14//! ```rust
15//! use anno::{Model, StackedNER};
16//!
17//! let m = StackedNER::default();
18//! let ents = m.extract_entities("Lynn Conway worked at IBM and Xerox PARC.", None)?;
19//! assert!(!ents.is_empty());
20//! # Ok::<(), anno::Error>(())
21//! ```
22//!
23//! ## Zero-shot custom entity types
24//!
25//! Zero-shot custom entity types are provided by GLiNER backends when the `onnx` feature is
26//! enabled. See the repo docs for the CLI flag (`--extract-types`) and the library API.
27//!
28//! ## Offline / downloads
29//!
30//! By default, ML weights may download on first use. To force cached-only behavior, set
31//! `ANNO_NO_DOWNLOADS=1` (after prefetching models).
32
33#![warn(missing_docs)]
34
35// Allow unit tests (and included CI test modules) to refer to this crate as `anno::...`,
36// matching integration-test style imports.
37extern crate self as anno;
38
39// Module declarations (standard Cargo layout under `src/`)
40pub mod backends;
41/// Edit distance algorithms.
42pub mod edit_distance;
43pub mod env;
44pub mod error;
45/// Evaluation/analysis primitives (coref metrics, cluster encoders, etc.).
46///
47/// This module is only available when the legacy `eval` feature (or the preferred `analysis`
48/// alias) is enabled.
49#[cfg(any(feature = "analysis", feature = "eval"))]
50pub mod eval;
51/// Entity feature extraction for downstream ML and analysis.
52pub mod features;
53/// Small, dependency-light heuristics (negation, quantifiers, etc.).
54pub mod heuristics;
55/// Lightweight URL/file ingestion helpers (not a crawling/pipeline product).
56pub mod ingest;
57/// Joint inference experiments (optional; not the primary API surface).
58pub mod joint;
59/// Keyword and keyphrase extraction (TF-IDF, YAKE, TextRank).
60#[cfg(feature = "graph")]
61pub mod keywords;
62pub mod lang;
63/// Knowledge-base linking helpers (experimental).
64pub mod linking;
65pub mod offset;
66/// Shared PageRank algorithm for graph-based ranking.
67#[cfg(feature = "graph")]
68pub mod pagerank;
69/// Preprocessing for mention detection.
70pub mod preprocess;
71/// Entity salience and importance ranking.
72#[cfg(feature = "graph")]
73pub mod salience;
74pub mod schema;
75pub mod similarity;
76/// Extractive summarization.
77#[cfg(feature = "graph")]
78pub mod summarize;
79pub mod sync;
80/// Temporal entity tracking, parsing, and diachronic NER.
81pub mod temporal;
82/// Language-specific tokenization for multilingual NLP.
83pub mod tokenizer;
84pub mod types;
85
86// Note: research-only geometry experiments were archived out of `anno` to keep the public
87// surface grounded. Prefer `docs/` for repo-local design notes and experiments.
88
89/// Discourse-level analysis for coreference resolution.
90///
91/// Provides infrastructure for handling phenomena that span sentence boundaries:
92///
93/// - **Centering theory**: Track discourse focus through forward/backward-looking centers
94/// - **Uncertain reference**: Deferred resolution using epsilon-term semantics
95/// - **Abstract anaphora**: Pronouns referring to events, propositions, facts
96/// - **Shell nouns**: Abstract nouns like "problem", "issue", "fact"
97///
98/// Enable with the `discourse` feature.
99///
100/// See `discourse::centering` for salience-based pronoun resolution and
101/// `discourse::uncertain_reference` for handling ambiguous references.
102#[cfg(feature = "discourse")]
103pub mod discourse;
104
105// Re-export error types
106pub use error::{Error, Result};
107
108// =============================================================================
109// Core types live in `anno-core`
110// =============================================================================
111
112// Re-export core types at the crate root (the `anno` public API surface).
113pub use anno_core::{
114    generate_span_candidates, CorefChain, CorefDocument, CoreferenceResolver, Corpus,
115    DiscontinuousSpan, Entity, EntityBuilder, EntityCategory, EntityType, EntityViewport,
116    ExtractionMethod, Gender, GroundedDocument, HashMapLexicon, HierarchicalConfidence, Identity,
117    IdentityId, IdentitySource, Lexicon, Location, Mention, MentionType, Modality, Number, Person,
118    PhiFeatures, Provenance, Quantifier, RaggedBatch, Relation, Signal, SignalId, SignalRef, Span,
119    SpanCandidate, Track, TrackId, TrackRef, TrackStats, TypeLabel, TypeMapper, ValidationIssue,
120};
121
122/// `anno-core`’s stable types under a namespaced module.
123///
124/// This exists for readability in downstream codebases (e.g. `anno::core::Entity`)
125/// and mirrors the structure of the internal `anno-core` crate.
126pub mod core {
127    pub use anno_core::core::*;
128}
129
130// Re-export commonly used types
131pub use lang::{detect_language, Language};
132pub use offset::{
133    bytes_to_chars, chars_to_bytes, is_ascii, OffsetMapping, SpanConverter, TextSpan, TokenSpan,
134};
135pub use schema::*;
136pub use similarity::*;
137pub use sync::*;
138pub use types::*;
139
140// =============================================================================
141// Sealed Trait Pattern
142// =============================================================================
143//
144// The `Model` trait is sealed to:
145// 1. Maintain invariants (entities have valid offsets, confidence in [0,1])
146// 2. Allow adding methods without breaking external implementations
147// 3. Ensure all backends share consistent behavior
148//
149// For external/plugin backends, use the `AnyModel` wrapper (see below).
150// =============================================================================
151
// Private module implementing the sealed-trait pattern. `Model` requires
// `sealed::Sealed`; because this module is private, downstream crates cannot
// add `Sealed` impls and therefore cannot implement `Model` directly.
// External backends go through the `AnyModel` adapter instead.
mod sealed {
    /// Marker trait: a type must have an impl here before it may implement `Model`.
    pub trait Sealed {}

    // Always-compiled backends.
    impl Sealed for super::RegexNER {}
    impl Sealed for super::HeuristicNER {}
    impl Sealed for super::StackedNER {}
    impl Sealed for super::EnsembleNER {}
    impl Sealed for super::CrfNER {}
    impl Sealed for super::NuNER {}
    impl Sealed for super::W2NER {}
    impl Sealed for super::NERExtractor {}

    // ONNX-backed models (feature `onnx`).
    #[cfg(feature = "onnx")]
    impl Sealed for super::BertNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::GLiNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::albert::ALBERTNER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::deberta_v3::DeBERTaV3NER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner_poly::GLiNERPoly {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner2::GLiNER2Onnx {}

    // Candle-backed models (feature `candle`).
    #[cfg(feature = "candle")]
    impl Sealed for super::CandleNER {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner_candle::GLiNERCandle {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner2::GLiNER2Candle {}

    // Generic over the encoder; `Send + Sync` bound mirrors the `Model` supertraits.
    #[cfg(feature = "candle")]
    impl<E: Send + Sync> Sealed for super::backends::gliner_pipeline::GLiNERPipeline<E> {}

    #[cfg(feature = "burn")]
    impl Sealed for super::backends::burn::BurnNER {}

    // Always-compiled auxiliary backends.
    impl Sealed for super::backends::tplinker::TPLinker {}
    impl Sealed for super::backends::universal_ner::UniversalNER {}
    impl Sealed for super::backends::lexicon::LexiconNER {}

    // Deprecated backend kept sealed so existing users still get `Model`.
    #[allow(deprecated)]
    impl Sealed for super::backends::rule::RuleBasedNER {}

    // Test/experimental entry points.
    impl Sealed for super::MockModel {}
    impl Sealed for super::joint::JointModel {}
}
207
208/// Trait for NER model backends.
209///
210/// # Sealed Trait
211///
212/// `Model` is intentionally sealed (cannot be implemented outside this crate) to:
213///
214/// 1. **Maintain invariants**: All backends must produce entities with valid character
215///    offsets, confidence in `[0, 1]`, and non-empty text.
216/// 2. **Allow evolution**: New methods can be added with default implementations
217///    without breaking external code.
218/// 3. **Ensure consistency**: All backends share standardized behavior for
219///    `is_available()`, `supported_types()`, etc.
220///
221/// # For External Backends
222///
223/// If you need to integrate an external NER backend (e.g., a REST API, Python model
224/// via PyO3, or custom implementation), use the [`AnyModel`] wrapper:
225///
226/// ```rust,ignore
227/// use anno::{AnyModel, Entity, EntityType, Result};
228///
229/// struct MyExternalNER { /* ... */ }
230///
231/// impl MyExternalNER {
232///     fn extract(&self, text: &str) -> Vec<Entity> {
233///         // Your implementation
234///         vec![]
235///     }
236/// }
237///
238/// // Wrap in AnyModel to use with anno's infrastructure
239/// let model = AnyModel::new(
240///     "my-ner",
241///     vec![EntityType::Person, EntityType::Organization],
242///     move |text, _lang| Ok(my_ner.extract(text)),
243/// );
244///
245/// // Now usable wherever Box<dyn Model> is expected
246/// let entities = model.extract_entities("Hello world", None)?;
247/// ```
248///
249/// [`AnyModel`]: crate::AnyModel
250pub trait Model: sealed::Sealed + Send + Sync {
251    /// Extract entities from text.
252    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>>;
253
254    /// Get supported entity types.
255    fn supported_types(&self) -> Vec<EntityType>;
256
257    /// Check if model is available and ready.
258    fn is_available(&self) -> bool;
259
260    /// Get the model name/identifier.
261    fn name(&self) -> &'static str {
262        "unknown"
263    }
264
265    /// Get a description of the model.
266    fn description(&self) -> &'static str {
267        "Unknown NER model"
268    }
269
270    /// Get capability summary for this model.
271    ///
272    /// Override this in implementations that support additional capabilities
273    /// (batch, GPU, streaming, etc.) to enable runtime discovery.
274    ///
275    /// # Default
276    ///
277    /// Returns a [`ModelCapabilities`] with all fields set to `false`/`None`.
278    fn capabilities(&self) -> ModelCapabilities {
279        ModelCapabilities::default()
280    }
281
282    /// Get a version identifier for the model configuration/weights.
283    ///
284    /// Used for cache invalidation. Default implementation returns "1".
285    fn version(&self) -> String {
286        "1".to_string()
287    }
288}
289
290// =============================================================================
291// AnyModel: Adapter for External Backends
292// =============================================================================
293
294/// A wrapper that allows external code to implement NER backends without
295/// directly implementing the sealed `Model` trait.
296///
297/// `AnyModel` acts as an adapter: you provide a closure that does the actual
298/// entity extraction, and `AnyModel` implements `Model` on your behalf.
299///
300/// # Example
301///
302/// ```rust
303/// use anno::{AnyModel, Entity, EntityType, Model, Result};
304///
305/// // Define extraction logic as a closure or function
306/// let my_extractor = |text: &str, _lang: Option<&str>| -> Result<Vec<Entity>> {
307///     // Your custom NER logic here
308///     Ok(vec![])
309/// };
310///
311/// // Wrap in AnyModel
312/// let model = AnyModel::new(
313///     "my-custom-ner",
314///     "Custom NER backend using external API",
315///     vec![EntityType::Person, EntityType::Organization],
316///     my_extractor,
317/// );
318///
319/// // Use like any other Model
320/// assert!(model.is_available());
321/// let entities = model.extract_entities("Hello world", None).unwrap();
322/// ```
323///
324/// # Thread Safety
325///
326/// The extractor closure must be `Send + Sync`. For interior mutability
327/// (e.g., caching, connection pooling), use `Arc<Mutex<...>>` or similar.
328/// Type alias for the `AnyModel` extractor closure.
329type AnyModelExtractor = dyn Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync;
330
331/// A wrapper that turns an extractor closure into a `Model`.
332pub struct AnyModel {
333    name: &'static str,
334    description: &'static str,
335    supported_types: Vec<EntityType>,
336    extractor: Box<AnyModelExtractor>,
337    version: String,
338}
339
340impl AnyModel {
341    /// Create a new `AnyModel` wrapper.
342    ///
343    /// # Arguments
344    ///
345    /// * `name` - Model identifier (e.g., "my-ner")
346    /// * `description` - Human-readable description
347    /// * `supported_types` - Entity types this model can extract
348    /// * `extractor` - Closure that performs the actual extraction
349    pub fn new(
350        name: &'static str,
351        description: &'static str,
352        supported_types: Vec<EntityType>,
353        extractor: impl Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync + 'static,
354    ) -> Self {
355        Self {
356            name,
357            description,
358            supported_types,
359            extractor: Box::new(extractor),
360            version: "1".to_string(),
361        }
362    }
363
364    /// Set the version string for cache invalidation.
365    pub fn with_version(mut self, version: impl Into<String>) -> Self {
366        self.version = version.into();
367        self
368    }
369}
370
371impl std::fmt::Debug for AnyModel {
372    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
373        f.debug_struct("AnyModel")
374            .field("name", &self.name)
375            .field("description", &self.description)
376            .field("supported_types", &self.supported_types)
377            .finish()
378    }
379}
380
381// AnyModel gets the Sealed impl so it can implement Model
382impl sealed::Sealed for AnyModel {}
383
384impl Model for AnyModel {
385    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
386        (self.extractor)(text, language)
387    }
388
389    fn supported_types(&self) -> Vec<EntityType> {
390        self.supported_types.clone()
391    }
392
393    fn is_available(&self) -> bool {
394        true
395    }
396
397    fn name(&self) -> &'static str {
398        self.name
399    }
400
401    fn description(&self) -> &'static str {
402        self.description
403    }
404
405    fn version(&self) -> String {
406        self.version.clone()
407    }
408}
409
410// =============================================================================
411// Capability Marker Traits
412// =============================================================================
413
414/// Trait for models that support batch processing.
415///
416/// Models implementing this trait can process multiple texts efficiently,
417/// potentially using parallel processing or optimized batch operations.
418pub trait BatchCapable: Model {
419    /// Extract entities from multiple texts in a batch.
420    ///
421    /// # Arguments
422    /// * `texts` - Slice of text strings to process
423    /// * `language` - Optional language hint for the texts
424    ///
425    /// # Returns
426    /// A vector of entity vectors, one per input text
427    fn extract_entities_batch(
428        &self,
429        texts: &[&str],
430        language: Option<&str>,
431    ) -> Result<Vec<Vec<Entity>>> {
432        texts
433            .iter()
434            .map(|text| self.extract_entities(text, language))
435            .collect()
436    }
437
438    /// Get the optimal batch size for this model, if applicable.
439    ///
440    /// Returns `None` if the model doesn't have a specific optimal batch size,
441    /// or `Some(n)` if there's a recommended batch size for best performance.
442    fn optimal_batch_size(&self) -> Option<usize> {
443        None
444    }
445}
446
/// Trait for models that support GPU acceleration.
///
/// Models implementing this trait can report whether GPU is active
/// and which device they're using. Note there are no default method
/// bodies here: implementers must answer both questions explicitly.
pub trait GpuCapable: Model {
    /// Check if GPU acceleration is currently active.
    ///
    /// Returns `true` if the model is using GPU, `false` if using CPU.
    fn is_gpu_active(&self) -> bool;

    /// Get the device identifier (e.g., "cuda:0", "cpu").
    ///
    /// Returns a string describing the compute device being used.
    fn device(&self) -> &str;
}
462
463/// Trait for models that support streaming/chunked extraction.
464///
465/// Useful for processing very long documents by splitting them into chunks
466/// and extracting entities from each chunk with proper offset tracking.
467pub trait StreamingCapable: Model {
468    /// Extract entities from a chunk of text, adjusting offsets by the chunk's position.
469    ///
470    /// # Arguments
471    ///
472    /// * `chunk` - A portion of the full document text
473    /// * `offset` - Character offset of this chunk within the full document
474    ///
475    /// # Returns
476    ///
477    /// Entities with offsets adjusted to their position in the full document.
478    fn extract_entities_streaming(&self, chunk: &str, offset: usize) -> Result<Vec<Entity>> {
479        let entities = self.extract_entities(chunk, None)?;
480        Ok(entities
481            .into_iter()
482            .map(|mut e| {
483                e.start += offset;
484                e.end += offset;
485                e
486            })
487            .collect())
488    }
489
490    /// Get the recommended chunk size for streaming extraction.
491    ///
492    /// Returns the optimal number of characters per chunk for this model.
493    /// Default implementation returns 10,000 characters.
494    fn recommended_chunk_size(&self) -> usize {
495        10_000
496    }
497}
498
/// Marker trait for models that extract named entities (persons, organizations, locations).
///
/// This is a marker trait used for type-level distinctions between different
/// model capabilities. All NER models should implement this.
pub trait NamedEntityCapable: Model {}

/// Marker trait for models that extract structured entities (dates, times, money, etc.).
///
/// This is a marker trait used for type-level distinctions between different
/// model capabilities. Models that extract structured data (like `RegexNER`) should implement this.
pub trait StructuredEntityCapable: Model {}
510
// =============================================================================
// Capability Discovery for Trait Objects
// =============================================================================

/// Summary of a model's capabilities, useful when working with `Box<dyn Model>`.
///
/// Since capability traits (`BatchCapable`, `GpuCapable`, etc.) can't be queried
/// through a `Box<dyn Model>` without downcasting, this struct provides a static
/// summary of what the model supports.
///
/// `Default` yields all-`false`/`None`, which is what `Model::capabilities`
/// returns unless a backend overrides it.
///
/// # Example
///
/// ```rust,ignore
/// use anno::{Model, ModelCapabilities};
///
/// fn process_with_model(model: &dyn Model) {
///     let caps = model.capabilities();
///
///     if caps.batch_capable {
///         println!("Model supports batch processing");
///     }
///     if caps.gpu_capable {
///         println!("Model can use GPU: {:?}", caps.device);
///     }
/// }
/// ```
#[derive(Debug, Clone, Default)]
pub struct ModelCapabilities {
    /// True if the model implements `BatchCapable`.
    pub batch_capable: bool,
    /// Optimal batch size, if batch capable.
    pub optimal_batch_size: Option<usize>,
    /// True if the model implements `GpuCapable`.
    pub gpu_capable: bool,
    /// True if GPU is currently active.
    pub gpu_active: bool,
    /// Device identifier (e.g., "cuda:0", "cpu"), if GPU capable.
    pub device: Option<String>,
    /// True if the model implements `StreamingCapable`.
    pub streaming_capable: bool,
    /// Recommended chunk size for streaming, if streaming capable.
    pub recommended_chunk_size: Option<usize>,
    /// True if the model implements `RelationCapable`.
    pub relation_capable: bool,
    /// True if the model implements `DynamicLabels` (zero-shot, caller-supplied entity types).
    pub dynamic_labels: bool,
    /// True if the model can extract discontinuous entities spanning non-adjacent spans.
    /// Only `W2NER` (when loaded with an ONNX session) sets this today.
    pub discontinuous_capable: bool,
}
561
/// Trait for models that can extract relations between entities.
///
/// Models implementing this trait can jointly extract entities and their relationships,
/// producing (head, relation_type, tail) triples.
pub trait RelationCapable: Model {
    /// Extract entities and their relations from text.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// A tuple of (entities, relations) where relations link entities together.
    fn extract_with_relations(
        &self,
        text: &str,
        language: Option<&str>,
    ) -> Result<(Vec<Entity>, Vec<Relation>)>;
}
583
/// Trait for models that support dynamic/zero-shot entity type specification.
///
/// Models implementing this trait can extract entities of arbitrary types
/// specified at inference time (e.g., GLiNER, UniversalNER), rather than
/// being limited to a fixed set of pre-trained types.
pub trait DynamicLabels: Model {
    /// Extract entities with custom type labels.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `labels` - Custom entity type labels to extract (e.g., ["PERSON", "ORGANIZATION"])
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// Entities of the specified types found in the text.
    fn extract_with_labels(
        &self,
        text: &str,
        labels: &[&str],
        language: Option<&str>,
    ) -> Result<Vec<Entity>>;
}
608
609// Re-export backends
610pub use backends::label_prompt::{LabelNormalizer, StandardNormalizer};
611pub use backends::{
612    AutoNER, BackendType, ConflictStrategy, CrfNER, EnsembleNER, HeuristicNER, LexiconNER,
613    NERExtractor, NuNER, RegexNER, StackedNER, TPLinker, W2NERConfig, W2NERRelation, W2NER,
614};
615
616// Mention-ranking coreference (Bourgois & Poibeau 2025)
617pub use backends::mention_ranking::{
618    ClusteringStrategy, MentionCluster, MentionRankingConfig, MentionRankingCoref, RankedMention,
619};
620
// MockModel (for testing) is defined later in this file, so no re-export is needed.
623// Re-export Model trait and related
624pub use backends::inference::*;
625
626#[cfg(feature = "onnx")]
627pub use backends::{BertNEROnnx, GLiNEROnnx};
628
629#[cfg(feature = "candle")]
630pub use backends::CandleNER;
631
// Constants: default HuggingFace model IDs per backend.

/// Default BERT ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_BERT_ONNX_MODEL: &str = "protectai/bert-base-NER-onnx";

/// Default GLiNER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_GLINER_MODEL: &str = "onnx-community/gliner_small-v2.1";

/// Default GLiNER2 ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_GLINER2_MODEL: &str = "onnx-community/gliner-multitask-large-v0.5";

/// Default Candle model identifier (HuggingFace model ID).
/// Chosen because it ships both tokenizer.json and safetensors.
// NOTE(review): the previous comment credited "dbmdz's model", but the constant
// points at dslim's repo — confirm which repo is intended.
pub const DEFAULT_CANDLE_MODEL: &str = "dslim/bert-base-NER";

/// Default GLiNER Candle model identifier (HuggingFace model ID).
/// Uses a model with tokenizer.json and pytorch_model.bin for Candle compatibility.
/// The backend converts pytorch_model.bin to safetensors automatically.
// NeuML/gliner-bert-tiny uses BERT (not DeBERTa) which is compatible with CandleEncoder
// Other GLiNER models use DeBERTa-v3 which has different architecture (relative attention)
pub const DEFAULT_GLINER_CANDLE_MODEL: &str = "NeuML/gliner-bert-tiny";

/// Default NuNER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_NUNER_MODEL: &str = "deepanwa/NuNerZero_onnx";

/// Default W2NER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_W2NER_MODEL: &str = "ljynlp/w2ner-bert-base";
659
/// Automatically select the best available NER backend.
///
/// Preference order: GLiNER (ONNX) → BERT (ONNX) → Candle BERT → `StackedNER`.
/// A backend whose constructor fails is treated as unavailable and the next
/// candidate is tried; the final `StackedNER` fallback always succeeds, so
/// this function never returns `Err` today (the `Result` leaves room for
/// stricter selection later).
pub fn auto() -> Result<Box<dyn Model>> {
    #[cfg(feature = "onnx")]
    {
        // Zero-shot GLiNER first, then the fixed-type BERT ONNX model.
        if let Ok(model) = GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
            return Ok(Box::new(model));
        }
        if let Ok(model) = BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
            return Ok(Box::new(model));
        }
    }
    #[cfg(feature = "candle")]
    {
        if let Ok(model) = CandleNER::from_pretrained(DEFAULT_CANDLE_MODEL) {
            return Ok(Box::new(model));
        }
    }
    // Pure-Rust heuristic/regex stack: no model downloads required.
    Ok(Box::new(StackedNER::default()))
}
679
/// Check which backends are currently available.
///
/// Returns `(backend_name, available)` pairs. Keep this list stable and
/// conservative: it is used by the CLI (`anno models list`) to show what a
/// given build can actually instantiate. Feature-gated ML backends are always
/// listed — with `false` when the feature is compiled out — so the CLI can
/// tell users what they are missing.
pub fn available_backends() -> Vec<(&'static str, bool)> {
    vec![
        // Zero-dependency / always compiled.
        ("RegexNER", true),
        ("HeuristicNER", true),
        ("StackedNER", true),
        ("EnsembleNER", true),
        ("CrfNER", true),
        ("HmmNER", true),
        // Feature-gated ML backends. `cfg!` evaluates at compile time, which
        // keeps one literal list instead of duplicated #[cfg]/#[cfg(not)]
        // push blocks; the resulting vector is identical either way.
        ("BertNEROnnx", cfg!(feature = "onnx")),
        ("GLiNEROnnx", cfg!(feature = "onnx")),
        ("NuNER", cfg!(feature = "onnx")),
        ("W2NER", cfg!(feature = "onnx")),
        ("CandleNER", cfg!(feature = "candle")),
    ]
}
722
/// A mock NER model for testing purposes.
///
/// This is provided so tests can create custom mock implementations
/// without breaking the sealed trait pattern.
///
/// # Entity Validation
///
/// By default, `extract_entities` validates that entity offsets are within
/// the input text bounds and that `start < end`. Set `validate = false`
/// (via `without_validation`) to disable this (useful for testing error handling).
///
/// # Example
///
/// ```rust
/// use anno::{MockModel, Entity, EntityType, Result};
///
/// let mock = MockModel::new("test-mock")
///     .with_entities(vec![
///         Entity::new("John", EntityType::Person, 0, 4, 0.9),
///     ]);
///
/// // Use mock in tests
/// ```
#[derive(Clone)]
pub struct MockModel {
    /// Model name identifier.
    name: &'static str,
    /// Entities to return when `extract_entities` is called.
    entities: Vec<Entity>,
    /// Supported entity types for this mock model.
    types: Vec<EntityType>,
    /// If true, validate entity offsets against input text (default: true).
    validate: bool,
}
757
758impl MockModel {
759    /// Create a new mock model.
760    #[must_use]
761    pub fn new(name: &'static str) -> Self {
762        Self {
763            name,
764            entities: Vec::new(),
765            types: Vec::new(),
766            validate: true,
767        }
768    }
769
770    /// Set entities to return on extraction.
771    ///
772    /// # Panics
773    ///
774    /// Panics if any entity has `start >= end`.
775    #[must_use]
776    pub fn with_entities(mut self, entities: Vec<Entity>) -> Self {
777        // Basic validation on construction
778        for (i, e) in entities.iter().enumerate() {
779            assert!(
780                e.start < e.end,
781                "MockModel entity {}: start ({}) must be < end ({})",
782                i,
783                e.start,
784                e.end
785            );
786            assert!(
787                e.confidence >= 0.0 && e.confidence <= 1.0,
788                "MockModel entity {}: confidence ({}) must be in [0.0, 1.0]",
789                i,
790                e.confidence
791            );
792        }
793        self.entities = entities;
794        self
795    }
796
797    /// Set supported entity types.
798    #[must_use]
799    pub fn with_types(mut self, types: Vec<EntityType>) -> Self {
800        self.types = types;
801        self
802    }
803
804    /// Disable offset validation during extraction (for testing error paths).
805    #[must_use]
806    pub fn without_validation(mut self) -> Self {
807        self.validate = false;
808        self
809    }
810
811    /// Validate that entity offsets are within text bounds.
812    fn validate_entities(&self, text: &str) -> Result<()> {
813        // Performance optimization: Cache text length (called once, used for all entities)
814        let text_len = text.chars().count();
815        for (i, e) in self.entities.iter().enumerate() {
816            if e.end > text_len {
817                return Err(Error::InvalidInput(format!(
818                    "MockModel entity {} '{}': end offset ({}) exceeds text length ({} chars)",
819                    i, e.text, e.end, text_len
820                )));
821            }
822            // Verify text matches (using char offsets)
823            // Use optimized extract_text_with_len to avoid recalculating length
824            let actual_text = e.extract_text_with_len(text, text_len);
825            if actual_text != e.text {
826                return Err(Error::InvalidInput(format!(
827                    "MockModel entity {} text mismatch: expected '{}' at [{},{}), found '{}'",
828                    i, e.text, e.start, e.end, actual_text
829                )));
830            }
831        }
832        Ok(())
833    }
834}
835
836impl Model for MockModel {
837    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
838        if self.validate && !self.entities.is_empty() {
839            self.validate_entities(text)?;
840        }
841        Ok(self.entities.clone())
842    }
843
844    fn supported_types(&self) -> Vec<EntityType> {
845        self.types.clone()
846    }
847
848    fn is_available(&self) -> bool {
849        true
850    }
851
852    fn name(&self) -> &'static str {
853        self.name
854    }
855
856    fn description(&self) -> &'static str {
857        "Mock NER model for testing"
858    }
859}
860
861// CI matrix harness moved to `anno-eval`.