// anno — crate root (src/lib.rs)
1//! # anno
2//!
3//! Information extraction: named entity recognition (NER) and within-document coreference.
4//!
5//! - **NER output**: variable-length spans with **character offsets** (Unicode scalar values), not
6//!   byte offsets.
7//! - **Coreference output**: clusters (“tracks”) of mentions within one document.
8//!
9//! This crate focuses on inference-time extraction. Dataset loaders, benchmarking, and matrix
10//! evaluation tooling live in `anno-eval` (and the `anno` CLI lives in `anno-cli`).
11//!
12//! ## Quickstart
13//!
14//! ```rust
15//! use anno::{Model, StackedNER};
16//!
17//! let m = StackedNER::default();
18//! let ents = m.extract_entities("Lynn Conway worked at IBM and Xerox PARC.", None)?;
19//! assert!(!ents.is_empty());
20//! # Ok::<(), anno::Error>(())
21//! ```
22//!
23//! ## Zero-shot custom entity types
24//!
25//! Zero-shot custom entity types are provided by GLiNER backends when the `onnx` feature is
26//! enabled. See the repo docs for the CLI flag (`--extract-types`) and the library API.
27//!
28//! ## Offline / downloads
29//!
30//! By default, ML weights may download on first use. To force cached-only behavior, set
31//! `ANNO_NO_DOWNLOADS=1` (after prefetching models).
32
33#![warn(missing_docs)]
34
35// Allow unit tests (and included CI test modules) to refer to this crate as `anno::...`,
36// matching integration-test style imports.
37extern crate self as anno;
38
39// Module declarations (standard Cargo layout under `src/`)
40pub mod backends;
41/// Edit distance algorithms.
42pub mod edit_distance;
43pub mod env;
44pub mod error;
45/// Evaluation/analysis primitives (coref metrics, cluster encoders, etc.).
46///
47/// This module is only available when the legacy `eval` feature (or the preferred `analysis`
48/// alias) is enabled.
49#[cfg(any(feature = "analysis", feature = "eval"))]
50pub mod eval;
51/// Entity feature extraction for downstream ML and analysis.
52pub mod features;
53/// Small, dependency-light heuristics (negation, quantifiers, etc.).
54pub mod heuristics;
55/// Lightweight URL/file ingestion helpers (not a crawling/pipeline product).
56pub mod ingest;
57/// Joint inference experiments (optional; not the primary API surface).
58pub mod joint;
59/// Keyword and keyphrase extraction (TF-IDF, YAKE, TextRank).
60#[cfg(feature = "graph")]
61pub mod keywords;
62pub mod lang;
63/// Knowledge-base linking helpers (experimental).
64pub mod linking;
65pub mod offset;
66/// Shared PageRank algorithm for graph-based ranking.
67#[cfg(feature = "graph")]
68pub mod pagerank;
69/// Preprocessing for mention detection.
70pub mod preprocess;
71/// Entity salience and importance ranking.
72#[cfg(feature = "graph")]
73pub mod salience;
74pub mod schema;
75pub mod similarity;
76/// Extractive summarization.
77#[cfg(feature = "graph")]
78pub mod summarize;
79pub mod sync;
80/// Temporal entity tracking, parsing, and diachronic NER.
81pub mod temporal;
82/// Language-specific tokenization for multilingual NLP.
83pub mod tokenizer;
84pub mod types;
85
86// Note: research-only geometry experiments were archived out of `anno` to keep the public
87// surface grounded. Prefer `docs/` for repo-local design notes and experiments.
88
89/// Discourse-level analysis for coreference resolution.
90///
91/// Provides infrastructure for handling phenomena that span sentence boundaries:
92///
93/// - **Centering theory**: Track discourse focus through forward/backward-looking centers
94/// - **Uncertain reference**: Deferred resolution using epsilon-term semantics
95/// - **Abstract anaphora**: Pronouns referring to events, propositions, facts
96/// - **Shell nouns**: Abstract nouns like "problem", "issue", "fact"
97///
98/// Enable with the `discourse` feature.
99///
100/// See `discourse::centering` for salience-based pronoun resolution and
101/// `discourse::uncertain_reference` for handling ambiguous references.
102#[cfg(feature = "discourse")]
103pub mod discourse;
104
105// Re-export error types
106pub use error::{Error, Result};
107
108// =============================================================================
109// Core types live in `anno-core`
110// =============================================================================
111
112// Re-export core types at the crate root (the `anno` public API surface).
113pub use anno_core::{
114    generate_span_candidates, CorefChain, CorefDocument, CoreferenceResolver, Corpus,
115    DiscontinuousSpan, Entity, EntityBuilder, EntityCategory, EntityType, EntityViewport,
116    ExtractionMethod, Gender, GraphDocument, GraphEdge, GraphExportFormat, GraphNode,
117    GroundedDocument, HashMapLexicon, HierarchicalConfidence, Identity, IdentityId, IdentitySource,
118    Lexicon, Location, Mention, MentionType, Modality, Number, Person, PhiFeatures, Provenance,
119    Quantifier, RaggedBatch, Relation, Signal, SignalId, SignalRef, Span, SpanCandidate, Track,
120    TrackId, TrackRef, TrackStats, TypeLabel, TypeMapper, ValidationIssue,
121};
122
/// `anno-core`’s stable types under a namespaced module.
///
/// This exists for readability in downstream codebases (e.g. `anno::core::Entity`)
/// and mirrors the structure of the internal `anno-core` crate.
///
/// NOTE(review): this re-exports `anno_core::core::*`, which may not be the same
/// set as the curated crate-root re-export list above — confirm they stay in sync.
pub mod core {
    pub use anno_core::core::*;
}
130
131// Re-export commonly used types
132pub use lang::{detect_language, Language};
133pub use offset::{
134    bytes_to_chars, chars_to_bytes, is_ascii, OffsetMapping, SpanConverter, TextSpan, TokenSpan,
135};
136pub use schema::*;
137pub use similarity::*;
138pub use sync::*;
139pub use types::*;
140
141// =============================================================================
142// Sealed Trait Pattern
143// =============================================================================
144//
145// The `Model` trait is sealed to:
146// 1. Maintain invariants (entities have valid offsets, confidence in [0,1])
147// 2. Allow adding methods without breaking external implementations
148// 3. Ensure all backends share consistent behavior
149//
150// For external/plugin backends, use the `AnyModel` wrapper (see below).
151// =============================================================================
152
mod sealed {
    /// Private marker trait: `Model` requires `Sealed` as a supertrait, and only
    /// this crate can name `sealed::Sealed`, so external crates cannot add their
    /// own `Model` implementations (see the `Model` docs for the rationale).
    pub trait Sealed {}

    // Always-compiled backends (no optional features required).
    impl Sealed for super::RegexNER {}
    impl Sealed for super::HeuristicNER {}
    impl Sealed for super::StackedNER {}
    impl Sealed for super::EnsembleNER {}
    impl Sealed for super::CrfNER {}
    impl Sealed for super::NuNER {}
    impl Sealed for super::W2NER {}
    impl Sealed for super::NERExtractor {}

    // ONNX-backed models (feature = "onnx").
    #[cfg(feature = "onnx")]
    impl Sealed for super::BertNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::GLiNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::albert::ALBERTNER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::deberta_v3::DeBERTaV3NER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner_poly::GLiNERPoly {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner2::GLiNER2Onnx {}

    // Candle-backed models (feature = "candle").
    #[cfg(feature = "candle")]
    impl Sealed for super::CandleNER {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner_candle::GLiNERCandle {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner2::GLiNER2Candle {}

    // Generic over the encoder type; the `Send + Sync` bound mirrors the
    // `Model` supertraits so any sealed pipeline can still implement `Model`.
    #[cfg(feature = "candle")]
    impl<E: Send + Sync> Sealed for super::backends::gliner_pipeline::GLiNERPipeline<E> {}

    #[cfg(feature = "burn")]
    impl Sealed for super::backends::burn::BurnNER {}

    impl Sealed for super::backends::tplinker::TPLinker {}
    impl Sealed for super::backends::universal_ner::UniversalNER {}
    impl Sealed for super::backends::lexicon::LexiconNER {}

    // Deprecated backend kept sealed for backward compatibility.
    #[allow(deprecated)]
    impl Sealed for super::backends::rule::RuleBasedNER {}

    impl Sealed for super::MockModel {}
    impl Sealed for super::joint::JointModel {}
}
208
/// Trait for NER model backends.
///
/// # Sealed Trait
///
/// `Model` is intentionally sealed (cannot be implemented outside this crate) to:
///
/// 1. **Maintain invariants**: All backends must produce entities with valid character
///    offsets, confidence in `[0, 1]`, and non-empty text.
/// 2. **Allow evolution**: New methods can be added with default implementations
///    without breaking external code.
/// 3. **Ensure consistency**: All backends share standardized behavior for
///    `is_available()`, `supported_types()`, etc.
///
/// # For External Backends
///
/// If you need to integrate an external NER backend (e.g., a REST API, Python model
/// via PyO3, or custom implementation), use the [`AnyModel`] wrapper:
///
/// ```rust,ignore
/// use anno::{AnyModel, Entity, EntityType, Result};
///
/// struct MyExternalNER { /* ... */ }
///
/// impl MyExternalNER {
///     fn extract(&self, text: &str) -> Vec<Entity> {
///         // Your implementation
///         vec![]
///     }
/// }
///
/// // Wrap in AnyModel to use with anno's infrastructure.
/// // `AnyModel::new` takes (name, description, supported_types, extractor).
/// let my_ner = MyExternalNER { /* ... */ };
/// let model = AnyModel::new(
///     "my-ner",
///     "External NER backend",
///     vec![EntityType::Person, EntityType::Organization],
///     move |text, _lang| Ok(my_ner.extract(text)),
/// );
///
/// // Now usable wherever Box<dyn Model> is expected
/// let entities = model.extract_entities("Hello world", None)?;
/// ```
///
/// [`AnyModel`]: crate::AnyModel
pub trait Model: sealed::Sealed + Send + Sync {
    /// Extract entities from text.
    ///
    /// `language` is an optional hint (e.g. "en"); backends may ignore it.
    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>>;

    /// Get supported entity types.
    fn supported_types(&self) -> Vec<EntityType>;

    /// Check if model is available and ready.
    fn is_available(&self) -> bool;

    /// Get the model name/identifier.
    ///
    /// Defaults to `"unknown"`; backends should override this.
    fn name(&self) -> &'static str {
        "unknown"
    }

    /// Get a description of the model.
    fn description(&self) -> &'static str {
        "Unknown NER model"
    }

    /// Get capability summary for this model.
    ///
    /// Override this in implementations that support additional capabilities
    /// (batch, GPU, streaming, etc.) to enable runtime discovery.
    ///
    /// # Default
    ///
    /// Returns a [`ModelCapabilities`] with all fields set to `false`/`None`.
    fn capabilities(&self) -> ModelCapabilities {
        ModelCapabilities::default()
    }

    /// Get a version identifier for the model configuration/weights.
    ///
    /// Used for cache invalidation. Default implementation returns "1".
    fn version(&self) -> String {
        "1".to_string()
    }
}
290
291// =============================================================================
292// AnyModel: Adapter for External Backends
293// =============================================================================
294
295/// A wrapper that allows external code to implement NER backends without
296/// directly implementing the sealed `Model` trait.
297///
298/// `AnyModel` acts as an adapter: you provide a closure that does the actual
299/// entity extraction, and `AnyModel` implements `Model` on your behalf.
300///
301/// # Example
302///
303/// ```rust
304/// use anno::{AnyModel, Entity, EntityType, Model, Result};
305///
306/// // Define extraction logic as a closure or function
307/// let my_extractor = |text: &str, _lang: Option<&str>| -> Result<Vec<Entity>> {
308///     // Your custom NER logic here
309///     Ok(vec![])
310/// };
311///
312/// // Wrap in AnyModel
313/// let model = AnyModel::new(
314///     "my-custom-ner",
315///     "Custom NER backend using external API",
316///     vec![EntityType::Person, EntityType::Organization],
317///     my_extractor,
318/// );
319///
320/// // Use like any other Model
321/// assert!(model.is_available());
322/// let entities = model.extract_entities("Hello world", None).unwrap();
323/// ```
324///
325/// # Thread Safety
326///
327/// The extractor closure must be `Send + Sync`. For interior mutability
328/// (e.g., caching, connection pooling), use `Arc<Mutex<...>>` or similar.
329/// Type alias for the `AnyModel` extractor closure.
330type AnyModelExtractor = dyn Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync;
331
332/// A wrapper that turns an extractor closure into a `Model`.
333pub struct AnyModel {
334    name: &'static str,
335    description: &'static str,
336    supported_types: Vec<EntityType>,
337    extractor: Box<AnyModelExtractor>,
338    version: String,
339}
340
341impl AnyModel {
342    /// Create a new `AnyModel` wrapper.
343    ///
344    /// # Arguments
345    ///
346    /// * `name` - Model identifier (e.g., "my-ner")
347    /// * `description` - Human-readable description
348    /// * `supported_types` - Entity types this model can extract
349    /// * `extractor` - Closure that performs the actual extraction
350    pub fn new(
351        name: &'static str,
352        description: &'static str,
353        supported_types: Vec<EntityType>,
354        extractor: impl Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync + 'static,
355    ) -> Self {
356        Self {
357            name,
358            description,
359            supported_types,
360            extractor: Box::new(extractor),
361            version: "1".to_string(),
362        }
363    }
364
365    /// Set the version string for cache invalidation.
366    pub fn with_version(mut self, version: impl Into<String>) -> Self {
367        self.version = version.into();
368        self
369    }
370}
371
372impl std::fmt::Debug for AnyModel {
373    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
374        f.debug_struct("AnyModel")
375            .field("name", &self.name)
376            .field("description", &self.description)
377            .field("supported_types", &self.supported_types)
378            .finish()
379    }
380}
381
382// AnyModel gets the Sealed impl so it can implement Model
383impl sealed::Sealed for AnyModel {}
384
385impl Model for AnyModel {
386    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
387        (self.extractor)(text, language)
388    }
389
390    fn supported_types(&self) -> Vec<EntityType> {
391        self.supported_types.clone()
392    }
393
394    fn is_available(&self) -> bool {
395        true
396    }
397
398    fn name(&self) -> &'static str {
399        self.name
400    }
401
402    fn description(&self) -> &'static str {
403        self.description
404    }
405
406    fn version(&self) -> String {
407        self.version.clone()
408    }
409}
410
411// =============================================================================
412// Capability Marker Traits
413// =============================================================================
414
415/// Trait for models that support batch processing.
416///
417/// Models implementing this trait can process multiple texts efficiently,
418/// potentially using parallel processing or optimized batch operations.
419pub trait BatchCapable: Model {
420    /// Extract entities from multiple texts in a batch.
421    ///
422    /// # Arguments
423    /// * `texts` - Slice of text strings to process
424    /// * `language` - Optional language hint for the texts
425    ///
426    /// # Returns
427    /// A vector of entity vectors, one per input text
428    fn extract_entities_batch(
429        &self,
430        texts: &[&str],
431        language: Option<&str>,
432    ) -> Result<Vec<Vec<Entity>>> {
433        texts
434            .iter()
435            .map(|text| self.extract_entities(text, language))
436            .collect()
437    }
438
439    /// Get the optimal batch size for this model, if applicable.
440    ///
441    /// Returns `None` if the model doesn't have a specific optimal batch size,
442    /// or `Some(n)` if there's a recommended batch size for best performance.
443    fn optimal_batch_size(&self) -> Option<usize> {
444        None
445    }
446}
447
/// Trait for models that support GPU acceleration.
///
/// Models implementing this trait can report whether GPU is active
/// and which device they're using. There are no default implementations:
/// each backend must report its own device state.
pub trait GpuCapable: Model {
    /// Check if GPU acceleration is currently active.
    ///
    /// Returns `true` if the model is using GPU, `false` if using CPU.
    fn is_gpu_active(&self) -> bool;

    /// Get the device identifier (e.g., "cuda:0", "cpu").
    ///
    /// Returns a string describing the compute device being used.
    fn device(&self) -> &str;
}
463
464/// Trait for models that support streaming/chunked extraction.
465///
466/// Useful for processing very long documents by splitting them into chunks
467/// and extracting entities from each chunk with proper offset tracking.
468pub trait StreamingCapable: Model {
469    /// Extract entities from a chunk of text, adjusting offsets by the chunk's position.
470    ///
471    /// # Arguments
472    ///
473    /// * `chunk` - A portion of the full document text
474    /// * `offset` - Character offset of this chunk within the full document
475    ///
476    /// # Returns
477    ///
478    /// Entities with offsets adjusted to their position in the full document.
479    fn extract_entities_streaming(&self, chunk: &str, offset: usize) -> Result<Vec<Entity>> {
480        let entities = self.extract_entities(chunk, None)?;
481        Ok(entities
482            .into_iter()
483            .map(|mut e| {
484                e.start += offset;
485                e.end += offset;
486                e
487            })
488            .collect())
489    }
490
491    /// Get the recommended chunk size for streaming extraction.
492    ///
493    /// Returns the optimal number of characters per chunk for this model.
494    /// Default implementation returns 10,000 characters.
495    fn recommended_chunk_size(&self) -> usize {
496        10_000
497    }
498}
499
/// Marker trait for models that extract named entities (persons, organizations, locations).
///
/// This is a marker trait (no methods) used for type-level distinctions between
/// different model capabilities. All NER models should implement this:
/// `impl NamedEntityCapable for MyBackend {}`.
pub trait NamedEntityCapable: Model {}

/// Marker trait for models that extract structured entities (dates, times, money, etc.).
///
/// This is a marker trait (no methods) used for type-level distinctions between
/// different model capabilities. Models that extract structured data (like `RegexNER`)
/// should implement this.
pub trait StructuredEntityCapable: Model {}
511
512// =============================================================================
513// Capability Discovery for Trait Objects
514// =============================================================================
515
/// Summary of a model's capabilities, useful when working with `Box<dyn Model>`.
///
/// Since capability traits (`BatchCapable`, `GpuCapable`, etc.) can't be queried
/// through a `Box<dyn Model>` without downcasting, this struct provides a static
/// summary of what the model supports.
///
/// The `Default` impl (all fields `false`/`None`) is what `Model::capabilities`
/// returns unless a backend overrides it.
///
/// # Example
///
/// ```rust,ignore
/// use anno::{Model, ModelCapabilities};
///
/// fn process_with_model(model: &dyn Model) {
///     let caps = model.capabilities();
///
///     if caps.batch_capable {
///         println!("Model supports batch processing");
///     }
///     if caps.gpu_capable {
///         println!("Model can use GPU: {:?}", caps.device);
///     }
/// }
/// ```
#[derive(Debug, Clone, Default)]
pub struct ModelCapabilities {
    /// True if the model implements `BatchCapable`.
    pub batch_capable: bool,
    /// Optimal batch size, if batch capable.
    pub optimal_batch_size: Option<usize>,
    /// True if the model implements `GpuCapable`.
    pub gpu_capable: bool,
    /// True if GPU is currently active.
    pub gpu_active: bool,
    /// Device identifier (e.g., "cuda:0", "cpu"), if GPU capable.
    pub device: Option<String>,
    /// True if the model implements `StreamingCapable`.
    pub streaming_capable: bool,
    /// Recommended chunk size for streaming, if streaming capable.
    pub recommended_chunk_size: Option<usize>,
    /// True if the model implements `RelationCapable`.
    pub relation_capable: bool,
}
557
/// Trait for models that can extract relations between entities.
///
/// Models implementing this trait can jointly extract entities and their relationships,
/// producing (head, relation_type, tail) triples.
pub trait RelationCapable: Model {
    /// Extract entities and their relations from text.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// A tuple of (entities, relations) where relations link entities together.
    /// There is no default implementation; relation extraction is backend-specific.
    fn extract_with_relations(
        &self,
        text: &str,
        language: Option<&str>,
    ) -> Result<(Vec<Entity>, Vec<Relation>)>;
}
579
/// Trait for models that support dynamic/zero-shot entity type specification.
///
/// Models implementing this trait can extract entities of arbitrary types
/// specified at inference time (e.g., GLiNER, UniversalNER), rather than
/// being limited to a fixed set of pre-trained types.
pub trait DynamicLabels: Model {
    /// Extract entities with custom type labels.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `labels` - Custom entity type labels to extract (e.g., ["PERSON", "ORGANIZATION"])
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// Entities of the specified types found in the text. How labels are matched
    /// to spans is backend-specific; there is no default implementation.
    fn extract_with_labels(
        &self,
        text: &str,
        labels: &[&str],
        language: Option<&str>,
    ) -> Result<Vec<Entity>>;
}
604
605// Re-export backends
606pub use backends::label_prompt::{LabelNormalizer, StandardNormalizer};
607pub use backends::{
608    AutoNER, BackendType, ConflictStrategy, CrfNER, EnsembleNER, HeuristicNER, LexiconNER,
609    NERExtractor, NuNER, RegexNER, StackedNER, TPLinker, W2NERConfig, W2NERRelation, W2NER,
610};
611
612// Mention-ranking coreference (Bourgois & Poibeau 2025)
613pub use backends::mention_ranking::{
614    ClusteringStrategy, MentionCluster, MentionRankingConfig, MentionRankingCoref, RankedMention,
615};
616
// MockModel (for testing) is defined later in this file, not re-exported from backends.
618
619// Re-export Model trait and related
620pub use backends::inference::*;
621
622#[cfg(feature = "onnx")]
623pub use backends::{BertNEROnnx, GLiNEROnnx};
624
625#[cfg(feature = "candle")]
626pub use backends::CandleNER;
627
628// Constants
629
630/// Default BERT ONNX model identifier (HuggingFace model ID).
631pub const DEFAULT_BERT_ONNX_MODEL: &str = "protectai/bert-base-NER-onnx";
632
633/// Default GLiNER ONNX model identifier (HuggingFace model ID).
634pub const DEFAULT_GLINER_MODEL: &str = "onnx-community/gliner_small-v2.1";
635
636/// Default GLiNER2 ONNX model identifier (HuggingFace model ID).
637pub const DEFAULT_GLINER2_MODEL: &str = "onnx-community/gliner-multitask-large-v0.5";
638
639/// Default Candle model identifier (HuggingFace model ID).
/// Uses dslim/bert-base-NER, which provides both tokenizer.json and safetensors.
641pub const DEFAULT_CANDLE_MODEL: &str = "dslim/bert-base-NER";
642
643/// Default GLiNER Candle model identifier (HuggingFace model ID).
644/// Uses a model with tokenizer.json and pytorch_model.bin for Candle compatibility.
645/// The backend converts pytorch_model.bin to safetensors automatically.
646// NeuML/gliner-bert-tiny uses BERT (not DeBERTa) which is compatible with CandleEncoder
647// Other GLiNER models use DeBERTa-v3 which has different architecture (relative attention)
648pub const DEFAULT_GLINER_CANDLE_MODEL: &str = "NeuML/gliner-bert-tiny";
649
650/// Default NuNER ONNX model identifier (HuggingFace model ID).
651pub const DEFAULT_NUNER_MODEL: &str = "deepanwa/NuNerZero_onnx";
652
653/// Default W2NER ONNX model identifier (HuggingFace model ID).
654pub const DEFAULT_W2NER_MODEL: &str = "ljynlp/w2ner-bert-base";
655
/// Automatically select the best available NER backend.
///
/// Preference order (first constructor that succeeds wins):
///
/// 1. `GLiNEROnnx` with [`DEFAULT_GLINER_MODEL`] (requires the `onnx` feature)
/// 2. `BertNEROnnx` with [`DEFAULT_BERT_ONNX_MODEL`] (requires `onnx`)
/// 3. `CandleNER` with [`DEFAULT_CANDLE_MODEL`] (requires `candle`)
/// 4. `StackedNER::default()` — always-compiled fallback
///
/// Construction failures (e.g. missing weights) are silently skipped in favor
/// of the next candidate, so the returned `Result` is currently always `Ok`.
pub fn auto() -> Result<Box<dyn Model>> {
    #[cfg(feature = "onnx")]
    {
        if let Ok(model) = GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
            return Ok(Box::new(model));
        }
        if let Ok(model) = BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
            return Ok(Box::new(model));
        }
    }
    #[cfg(feature = "candle")]
    {
        if let Ok(model) = CandleNER::from_pretrained(DEFAULT_CANDLE_MODEL) {
            return Ok(Box::new(model));
        }
    }
    Ok(Box::new(StackedNER::default()))
}
675
/// Check which backends are currently available.
///
/// Returns `(backend_name, available)` pairs. Feature-gated backends are always
/// listed — with `false` when the feature is disabled — so the CLI
/// (`anno models list`) can tell users what a given build is missing.
/// Keep this list stable and conservative: it reflects what a build can
/// actually instantiate.
pub fn available_backends() -> Vec<(&'static str, bool)> {
    vec![
        // Zero-dependency / always compiled.
        ("RegexNER", true),
        ("HeuristicNER", true),
        ("StackedNER", true),
        ("EnsembleNER", true),
        ("CrfNER", true),
        ("HmmNER", true),
        // Feature-gated ML backends. `cfg!` expands to a compile-time boolean,
        // replacing the previous duplicated #[cfg] / #[cfg(not)] blocks while
        // producing exactly the same list for every feature combination.
        ("BertNEROnnx", cfg!(feature = "onnx")),
        ("GLiNEROnnx", cfg!(feature = "onnx")),
        ("NuNER", cfg!(feature = "onnx")),
        ("W2NER", cfg!(feature = "onnx")),
        ("CandleNER", cfg!(feature = "candle")),
    ]
}
718
/// A mock NER model for testing purposes.
///
/// This is provided so tests can create custom mock implementations
/// without breaking the sealed trait pattern.
///
/// # Entity Validation
///
/// By default, `extract_entities` validates that entity offsets are within
/// the input text bounds and that `start < end`. Set `validate = false`
/// (via `without_validation`) to disable this (useful for testing error handling).
///
/// # Example
///
/// ```rust
/// use anno::{MockModel, Entity, EntityType};
///
/// let mock = MockModel::new("test-mock")
///     .with_entities(vec![
///         Entity::new("John", EntityType::Person, 0, 4, 0.9),
///     ]);
///
/// // Use mock in tests
/// ```
#[derive(Clone)]
pub struct MockModel {
    /// Model name identifier.
    name: &'static str,
    /// Entities to return when `extract_entities` is called.
    entities: Vec<Entity>,
    /// Supported entity types for this mock model.
    types: Vec<EntityType>,
    /// If true, validate entity offsets against input text (default: true).
    validate: bool,
}
753
754impl MockModel {
755    /// Create a new mock model.
756    #[must_use]
757    pub fn new(name: &'static str) -> Self {
758        Self {
759            name,
760            entities: Vec::new(),
761            types: Vec::new(),
762            validate: true,
763        }
764    }
765
766    /// Set entities to return on extraction.
767    ///
768    /// # Panics
769    ///
770    /// Panics if any entity has `start >= end`.
771    #[must_use]
772    pub fn with_entities(mut self, entities: Vec<Entity>) -> Self {
773        // Basic validation on construction
774        for (i, e) in entities.iter().enumerate() {
775            assert!(
776                e.start < e.end,
777                "MockModel entity {}: start ({}) must be < end ({})",
778                i,
779                e.start,
780                e.end
781            );
782            assert!(
783                e.confidence >= 0.0 && e.confidence <= 1.0,
784                "MockModel entity {}: confidence ({}) must be in [0.0, 1.0]",
785                i,
786                e.confidence
787            );
788        }
789        self.entities = entities;
790        self
791    }
792
793    /// Set supported entity types.
794    #[must_use]
795    pub fn with_types(mut self, types: Vec<EntityType>) -> Self {
796        self.types = types;
797        self
798    }
799
800    /// Disable offset validation during extraction (for testing error paths).
801    #[must_use]
802    pub fn without_validation(mut self) -> Self {
803        self.validate = false;
804        self
805    }
806
807    /// Validate that entity offsets are within text bounds.
808    fn validate_entities(&self, text: &str) -> Result<()> {
809        // Performance optimization: Cache text length (called once, used for all entities)
810        let text_len = text.chars().count();
811        for (i, e) in self.entities.iter().enumerate() {
812            if e.end > text_len {
813                return Err(Error::InvalidInput(format!(
814                    "MockModel entity {} '{}': end offset ({}) exceeds text length ({} chars)",
815                    i, e.text, e.end, text_len
816                )));
817            }
818            // Verify text matches (using char offsets)
819            // Use optimized extract_text_with_len to avoid recalculating length
820            let actual_text = e.extract_text_with_len(text, text_len);
821            if actual_text != e.text {
822                return Err(Error::InvalidInput(format!(
823                    "MockModel entity {} text mismatch: expected '{}' at [{},{}), found '{}'",
824                    i, e.text, e.start, e.end, actual_text
825                )));
826            }
827        }
828        Ok(())
829    }
830}
831
832impl Model for MockModel {
833    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
834        if self.validate && !self.entities.is_empty() {
835            self.validate_entities(text)?;
836        }
837        Ok(self.entities.clone())
838    }
839
840    fn supported_types(&self) -> Vec<EntityType> {
841        self.types.clone()
842    }
843
844    fn is_available(&self) -> bool {
845        true
846    }
847
848    fn name(&self) -> &'static str {
849        self.name
850    }
851
852    fn description(&self) -> &'static str {
853        "Mock NER model for testing"
854    }
855}
856
857// CI matrix harness moved to `anno-eval`.