anno/lib.rs
1//! # anno
2//!
3//! Information extraction: named entity recognition (NER) and within-document coreference.
4//!
5//! - **NER output**: variable-length spans with **character offsets** (Unicode scalar values), not
6//! byte offsets.
7//! - **Coreference output**: clusters (“tracks”) of mentions within one document.
8//!
9//! This crate focuses on inference-time extraction. Dataset loaders, benchmarking, and matrix
10//! evaluation tooling live in `anno-eval` (and the `anno` CLI lives in `anno-cli`).
11//!
12//! ## Quickstart
13//!
14//! ```rust
15//! use anno::{Model, StackedNER};
16//!
17//! let m = StackedNER::default();
18//! let ents = m.extract_entities("Lynn Conway worked at IBM and Xerox PARC.", None)?;
19//! assert!(!ents.is_empty());
20//! # Ok::<(), anno::Error>(())
21//! ```
22//!
23//! ## Zero-shot custom entity types
24//!
25//! Zero-shot custom entity types are provided by GLiNER backends when the `onnx` feature is
26//! enabled. See the repo docs for the CLI flag (`--extract-types`) and the library API.
27//!
28//! ## Offline / downloads
29//!
30//! By default, ML weights may download on first use. To force cached-only behavior, set
31//! `ANNO_NO_DOWNLOADS=1` (after prefetching models).
32
33#![warn(missing_docs)]
34
35// Allow unit tests (and included CI test modules) to refer to this crate as `anno::...`,
36// matching integration-test style imports.
37extern crate self as anno;
38
39// Module declarations (standard Cargo layout under `src/`)
40pub mod backends;
41/// Edit distance algorithms.
42pub mod edit_distance;
43pub mod env;
44pub mod error;
45/// Evaluation/analysis primitives (coref metrics, cluster encoders, etc.).
46///
47/// This module is only available when the legacy `eval` feature (or the preferred `analysis`
48/// alias) is enabled.
49#[cfg(any(feature = "analysis", feature = "eval"))]
50pub mod eval;
51/// Entity feature extraction for downstream ML and analysis.
52pub mod features;
53/// Small, dependency-light heuristics (negation, quantifiers, etc.).
54pub mod heuristics;
55/// Lightweight URL/file ingestion helpers (not a crawling/pipeline product).
56pub mod ingest;
57/// Joint inference experiments (optional; not the primary API surface).
58pub mod joint;
59/// Keyword and keyphrase extraction (TF-IDF, YAKE, TextRank).
60#[cfg(feature = "graph")]
61pub mod keywords;
62pub mod lang;
63/// Knowledge-base linking helpers (experimental).
64pub mod linking;
65pub mod offset;
66/// Shared PageRank algorithm for graph-based ranking.
67#[cfg(feature = "graph")]
68pub mod pagerank;
69/// Preprocessing for mention detection.
70pub mod preprocess;
71/// Entity salience and importance ranking.
72#[cfg(feature = "graph")]
73pub mod salience;
74pub mod schema;
75pub mod similarity;
76/// Extractive summarization.
77#[cfg(feature = "graph")]
78pub mod summarize;
79pub mod sync;
80/// Temporal entity tracking, parsing, and diachronic NER.
81pub mod temporal;
82/// Language-specific tokenization for multilingual NLP.
83pub mod tokenizer;
84pub mod types;
85
86// Note: research-only geometry experiments were archived out of `anno` to keep the public
87// surface grounded. Prefer `docs/` for repo-local design notes and experiments.
88
89/// Discourse-level analysis for coreference resolution.
90///
91/// Provides infrastructure for handling phenomena that span sentence boundaries:
92///
93/// - **Centering theory**: Track discourse focus through forward/backward-looking centers
94/// - **Uncertain reference**: Deferred resolution using epsilon-term semantics
95/// - **Abstract anaphora**: Pronouns referring to events, propositions, facts
96/// - **Shell nouns**: Abstract nouns like "problem", "issue", "fact"
97///
98/// Enable with the `discourse` feature.
99///
100/// See `discourse::centering` for salience-based pronoun resolution and
101/// `discourse::uncertain_reference` for handling ambiguous references.
102#[cfg(feature = "discourse")]
103pub mod discourse;
104
105// Re-export error types
106pub use error::{Error, Result};
107
108// =============================================================================
109// Core types live in `anno-core`
110// =============================================================================
111
112// Re-export core types at the crate root (the `anno` public API surface).
113pub use anno_core::{
114 generate_span_candidates, CorefChain, CorefDocument, CoreferenceResolver, Corpus,
115 DiscontinuousSpan, Entity, EntityBuilder, EntityCategory, EntityType, EntityViewport,
116 ExtractionMethod, Gender, GraphDocument, GraphEdge, GraphExportFormat, GraphNode,
117 GroundedDocument, HashMapLexicon, HierarchicalConfidence, Identity, IdentityId, IdentitySource,
118 Lexicon, Location, Mention, MentionType, Modality, Number, Person, PhiFeatures, Provenance,
119 Quantifier, RaggedBatch, Relation, Signal, SignalId, SignalRef, Span, SpanCandidate, Track,
120 TrackId, TrackRef, TrackStats, TypeLabel, TypeMapper, ValidationIssue,
121};
122
/// `anno-core`’s stable types under a namespaced module.
///
/// This exists for readability in downstream codebases (e.g. `anno::core::Entity`)
/// and mirrors the structure of the internal `anno-core` crate.
pub mod core {
    // Pure re-export: no items are defined directly in this module; everything
    // comes verbatim from the internal `anno_core` crate's `core` module.
    pub use anno_core::core::*;
}
130
131// Re-export commonly used types
132pub use lang::{detect_language, Language};
133pub use offset::{
134 bytes_to_chars, chars_to_bytes, is_ascii, OffsetMapping, SpanConverter, TextSpan, TokenSpan,
135};
136pub use schema::*;
137pub use similarity::*;
138pub use sync::*;
139pub use types::*;
140
141// =============================================================================
142// Sealed Trait Pattern
143// =============================================================================
144//
145// The `Model` trait is sealed to:
146// 1. Maintain invariants (entities have valid offsets, confidence in [0,1])
147// 2. Allow adding methods without breaking external implementations
148// 3. Ensure all backends share consistent behavior
149//
150// For external/plugin backends, use the `AnyModel` wrapper (see below).
151// =============================================================================
152
mod sealed {
    /// Private marker trait backing the sealed-trait pattern for `super::Model`.
    ///
    /// Only the types listed in this module can implement `Model`
    /// (plus `AnyModel`, whose `Sealed` impl lives next to its definition).
    pub trait Sealed {}

    // Always-compiled backends (no optional features required).
    impl Sealed for super::RegexNER {}
    impl Sealed for super::HeuristicNER {}
    impl Sealed for super::StackedNER {}
    impl Sealed for super::EnsembleNER {}
    impl Sealed for super::CrfNER {}
    impl Sealed for super::NuNER {}
    impl Sealed for super::W2NER {}
    impl Sealed for super::NERExtractor {}

    // ONNX-gated backends.
    #[cfg(feature = "onnx")]
    impl Sealed for super::BertNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::GLiNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::albert::ALBERTNER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::deberta_v3::DeBERTaV3NER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner_poly::GLiNERPoly {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner2::GLiNER2Onnx {}

    // Candle-gated backends.
    #[cfg(feature = "candle")]
    impl Sealed for super::CandleNER {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner_candle::GLiNERCandle {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner2::GLiNER2Candle {}

    // Generic over the encoder `E`; the bound mirrors `Model: Send + Sync`.
    #[cfg(feature = "candle")]
    impl<E: Send + Sync> Sealed for super::backends::gliner_pipeline::GLiNERPipeline<E> {}

    // Burn-gated backend.
    #[cfg(feature = "burn")]
    impl Sealed for super::backends::burn::BurnNER {}

    impl Sealed for super::backends::tplinker::TPLinker {}
    impl Sealed for super::backends::universal_ner::UniversalNER {}
    impl Sealed for super::backends::lexicon::LexiconNER {}

    // Kept sealed even though deprecated, so existing callers keep working.
    #[allow(deprecated)]
    impl Sealed for super::backends::rule::RuleBasedNER {}

    // Test helper and experimental joint model.
    impl Sealed for super::MockModel {}
    impl Sealed for super::joint::JointModel {}
}
208
/// Trait for NER model backends.
///
/// # Sealed Trait
///
/// `Model` is intentionally sealed (cannot be implemented outside this crate) to:
///
/// 1. **Maintain invariants**: All backends must produce entities with valid character
///    offsets, confidence in `[0, 1]`, and non-empty text.
/// 2. **Allow evolution**: New methods can be added with default implementations
///    without breaking external code.
/// 3. **Ensure consistency**: All backends share standardized behavior for
///    `is_available()`, `supported_types()`, etc.
///
/// # For External Backends
///
/// If you need to integrate an external NER backend (e.g., a REST API, Python model
/// via PyO3, or custom implementation), use the [`AnyModel`] wrapper:
///
/// ```rust,ignore
/// use anno::{AnyModel, Entity, EntityType, Result};
///
/// struct MyExternalNER { /* ... */ }
///
/// impl MyExternalNER {
///     fn extract(&self, text: &str) -> Vec<Entity> {
///         // Your implementation
///         vec![]
///     }
/// }
///
/// let my_ner = MyExternalNER { /* ... */ };
///
/// // Wrap in AnyModel to use with anno's infrastructure.
/// // `AnyModel::new` takes (name, description, supported types, extractor).
/// let model = AnyModel::new(
///     "my-ner",
///     "External NER backend",
///     vec![EntityType::Person, EntityType::Organization],
///     move |text, _lang| Ok(my_ner.extract(text)),
/// );
///
/// // Now usable wherever Box<dyn Model> is expected
/// let entities = model.extract_entities("Hello world", None)?;
/// ```
///
/// [`AnyModel`]: crate::AnyModel
pub trait Model: sealed::Sealed + Send + Sync {
    /// Extract entities from text.
    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>>;

    /// Get supported entity types.
    fn supported_types(&self) -> Vec<EntityType>;

    /// Check if model is available and ready.
    fn is_available(&self) -> bool;

    /// Get the model name/identifier.
    ///
    /// Defaults to `"unknown"`; concrete backends should override this.
    fn name(&self) -> &'static str {
        "unknown"
    }

    /// Get a description of the model.
    fn description(&self) -> &'static str {
        "Unknown NER model"
    }

    /// Get capability summary for this model.
    ///
    /// Override this in implementations that support additional capabilities
    /// (batch, GPU, streaming, etc.) to enable runtime discovery.
    ///
    /// # Default
    ///
    /// Returns a [`ModelCapabilities`] with all fields set to `false`/`None`.
    fn capabilities(&self) -> ModelCapabilities {
        ModelCapabilities::default()
    }

    /// Get a version identifier for the model configuration/weights.
    ///
    /// Used for cache invalidation. Default implementation returns "1".
    fn version(&self) -> String {
        "1".to_string()
    }
}
290
291// =============================================================================
292// AnyModel: Adapter for External Backends
293// =============================================================================
294
295/// A wrapper that allows external code to implement NER backends without
296/// directly implementing the sealed `Model` trait.
297///
298/// `AnyModel` acts as an adapter: you provide a closure that does the actual
299/// entity extraction, and `AnyModel` implements `Model` on your behalf.
300///
301/// # Example
302///
303/// ```rust
304/// use anno::{AnyModel, Entity, EntityType, Model, Result};
305///
306/// // Define extraction logic as a closure or function
307/// let my_extractor = |text: &str, _lang: Option<&str>| -> Result<Vec<Entity>> {
308/// // Your custom NER logic here
309/// Ok(vec![])
310/// };
311///
312/// // Wrap in AnyModel
313/// let model = AnyModel::new(
314/// "my-custom-ner",
315/// "Custom NER backend using external API",
316/// vec![EntityType::Person, EntityType::Organization],
317/// my_extractor,
318/// );
319///
320/// // Use like any other Model
321/// assert!(model.is_available());
322/// let entities = model.extract_entities("Hello world", None).unwrap();
323/// ```
324///
325/// # Thread Safety
326///
327/// The extractor closure must be `Send + Sync`. For interior mutability
328/// (e.g., caching, connection pooling), use `Arc<Mutex<...>>` or similar.
329/// Type alias for the `AnyModel` extractor closure.
330type AnyModelExtractor = dyn Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync;
331
332/// A wrapper that turns an extractor closure into a `Model`.
333pub struct AnyModel {
334 name: &'static str,
335 description: &'static str,
336 supported_types: Vec<EntityType>,
337 extractor: Box<AnyModelExtractor>,
338 version: String,
339}
340
341impl AnyModel {
342 /// Create a new `AnyModel` wrapper.
343 ///
344 /// # Arguments
345 ///
346 /// * `name` - Model identifier (e.g., "my-ner")
347 /// * `description` - Human-readable description
348 /// * `supported_types` - Entity types this model can extract
349 /// * `extractor` - Closure that performs the actual extraction
350 pub fn new(
351 name: &'static str,
352 description: &'static str,
353 supported_types: Vec<EntityType>,
354 extractor: impl Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync + 'static,
355 ) -> Self {
356 Self {
357 name,
358 description,
359 supported_types,
360 extractor: Box::new(extractor),
361 version: "1".to_string(),
362 }
363 }
364
365 /// Set the version string for cache invalidation.
366 pub fn with_version(mut self, version: impl Into<String>) -> Self {
367 self.version = version.into();
368 self
369 }
370}
371
372impl std::fmt::Debug for AnyModel {
373 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
374 f.debug_struct("AnyModel")
375 .field("name", &self.name)
376 .field("description", &self.description)
377 .field("supported_types", &self.supported_types)
378 .finish()
379 }
380}
381
// AnyModel gets the Sealed impl so it can implement Model
impl sealed::Sealed for AnyModel {}

impl Model for AnyModel {
    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
        // Delegate directly to the user-supplied closure; no extra validation
        // or post-processing is applied here.
        (self.extractor)(text, language)
    }

    fn supported_types(&self) -> Vec<EntityType> {
        self.supported_types.clone()
    }

    fn is_available(&self) -> bool {
        // Wrapped backends are always reported as ready; any availability
        // checking must happen inside the extractor closure itself.
        true
    }

    fn name(&self) -> &'static str {
        self.name
    }

    fn description(&self) -> &'static str {
        self.description
    }

    fn version(&self) -> String {
        self.version.clone()
    }
}
410
411// =============================================================================
412// Capability Marker Traits
413// =============================================================================
414
415/// Trait for models that support batch processing.
416///
417/// Models implementing this trait can process multiple texts efficiently,
418/// potentially using parallel processing or optimized batch operations.
419pub trait BatchCapable: Model {
420 /// Extract entities from multiple texts in a batch.
421 ///
422 /// # Arguments
423 /// * `texts` - Slice of text strings to process
424 /// * `language` - Optional language hint for the texts
425 ///
426 /// # Returns
427 /// A vector of entity vectors, one per input text
428 fn extract_entities_batch(
429 &self,
430 texts: &[&str],
431 language: Option<&str>,
432 ) -> Result<Vec<Vec<Entity>>> {
433 texts
434 .iter()
435 .map(|text| self.extract_entities(text, language))
436 .collect()
437 }
438
439 /// Get the optimal batch size for this model, if applicable.
440 ///
441 /// Returns `None` if the model doesn't have a specific optimal batch size,
442 /// or `Some(n)` if there's a recommended batch size for best performance.
443 fn optimal_batch_size(&self) -> Option<usize> {
444 None
445 }
446}
447
/// Trait for models that support GPU acceleration.
///
/// Models implementing this trait can report whether GPU is active
/// and which device they're using.
///
/// Both methods are required; there are no default implementations.
pub trait GpuCapable: Model {
    /// Check if GPU acceleration is currently active.
    ///
    /// Returns `true` if the model is using GPU, `false` if using CPU.
    fn is_gpu_active(&self) -> bool;

    /// Get the device identifier (e.g., "cuda:0", "cpu").
    ///
    /// Returns a string describing the compute device being used.
    fn device(&self) -> &str;
}
463
464/// Trait for models that support streaming/chunked extraction.
465///
466/// Useful for processing very long documents by splitting them into chunks
467/// and extracting entities from each chunk with proper offset tracking.
468pub trait StreamingCapable: Model {
469 /// Extract entities from a chunk of text, adjusting offsets by the chunk's position.
470 ///
471 /// # Arguments
472 ///
473 /// * `chunk` - A portion of the full document text
474 /// * `offset` - Character offset of this chunk within the full document
475 ///
476 /// # Returns
477 ///
478 /// Entities with offsets adjusted to their position in the full document.
479 fn extract_entities_streaming(&self, chunk: &str, offset: usize) -> Result<Vec<Entity>> {
480 let entities = self.extract_entities(chunk, None)?;
481 Ok(entities
482 .into_iter()
483 .map(|mut e| {
484 e.start += offset;
485 e.end += offset;
486 e
487 })
488 .collect())
489 }
490
491 /// Get the recommended chunk size for streaming extraction.
492 ///
493 /// Returns the optimal number of characters per chunk for this model.
494 /// Default implementation returns 10,000 characters.
495 fn recommended_chunk_size(&self) -> usize {
496 10_000
497 }
498}
499
/// Marker trait for models that extract named entities (persons, organizations, locations).
///
/// This is a marker trait used for type-level distinctions between different
/// model capabilities. All NER models should implement this.
///
/// It declares no methods; it exists purely for use as a trait bound.
pub trait NamedEntityCapable: Model {}

/// Marker trait for models that extract structured entities (dates, times, money, etc.).
///
/// This is a marker trait used for type-level distinctions between different
/// model capabilities. Models that extract structured data (like `RegexNER`) should implement this.
///
/// It declares no methods; it exists purely for use as a trait bound.
pub trait StructuredEntityCapable: Model {}
511
512// =============================================================================
513// Capability Discovery for Trait Objects
514// =============================================================================
515
/// Summary of a model's capabilities, useful when working with `Box<dyn Model>`.
///
/// Since capability traits (`BatchCapable`, `GpuCapable`, etc.) can't be queried
/// through a `Box<dyn Model>` without downcasting, this struct provides a static
/// summary of what the model supports.
///
/// Obtain an instance via [`Model::capabilities`]; the `Default` value (all
/// `false`/`None`) matches that method's default implementation.
///
/// # Example
///
/// ```rust,ignore
/// use anno::{Model, ModelCapabilities};
///
/// fn process_with_model(model: &dyn Model) {
///     let caps = model.capabilities();
///
///     if caps.batch_capable {
///         println!("Model supports batch processing");
///     }
///     if caps.gpu_capable {
///         println!("Model can use GPU: {:?}", caps.device);
///     }
/// }
/// ```
#[derive(Debug, Clone, Default)]
pub struct ModelCapabilities {
    /// True if the model implements `BatchCapable`.
    pub batch_capable: bool,
    /// Optimal batch size, if batch capable.
    pub optimal_batch_size: Option<usize>,
    /// True if the model implements `GpuCapable`.
    pub gpu_capable: bool,
    /// True if GPU is currently active.
    pub gpu_active: bool,
    /// Device identifier (e.g., "cuda:0", "cpu"), if GPU capable.
    pub device: Option<String>,
    /// True if the model implements `StreamingCapable`.
    pub streaming_capable: bool,
    /// Recommended chunk size for streaming, if streaming capable.
    pub recommended_chunk_size: Option<usize>,
    /// True if the model implements `RelationCapable`.
    pub relation_capable: bool,
}
557
/// Trait for models that can extract relations between entities.
///
/// Models implementing this trait can jointly extract entities and their relationships,
/// producing (head, relation_type, tail) triples.
///
/// There is no default implementation; implementors must provide
/// `extract_with_relations`.
pub trait RelationCapable: Model {
    /// Extract entities and their relations from text.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// A tuple of (entities, relations) where relations link entities together.
    fn extract_with_relations(
        &self,
        text: &str,
        language: Option<&str>,
    ) -> Result<(Vec<Entity>, Vec<Relation>)>;
}
579
/// Trait for models that support dynamic/zero-shot entity type specification.
///
/// Models implementing this trait can extract entities of arbitrary types
/// specified at inference time (e.g., GLiNER, UniversalNER), rather than
/// being limited to a fixed set of pre-trained types.
///
/// There is no default implementation; implementors must provide
/// `extract_with_labels`.
pub trait DynamicLabels: Model {
    /// Extract entities with custom type labels.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `labels` - Custom entity type labels to extract (e.g., ["PERSON", "ORGANIZATION"])
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// Entities of the specified types found in the text.
    fn extract_with_labels(
        &self,
        text: &str,
        labels: &[&str],
        language: Option<&str>,
    ) -> Result<Vec<Entity>>;
}
604
605// Re-export backends
606pub use backends::label_prompt::{LabelNormalizer, StandardNormalizer};
607pub use backends::{
608 AutoNER, BackendType, ConflictStrategy, CrfNER, EnsembleNER, HeuristicNER, LexiconNER,
609 NERExtractor, NuNER, RegexNER, StackedNER, TPLinker, W2NERConfig, W2NERRelation, W2NER,
610};
611
612// Mention-ranking coreference (Bourgois & Poibeau 2025)
613pub use backends::mention_ranking::{
614 ClusteringStrategy, MentionCluster, MentionRankingConfig, MentionRankingCoref, RankedMention,
615};
616
// MockModel (for testing) is defined later in this file; no re-export is needed here.
618
619// Re-export Model trait and related
620pub use backends::inference::*;
621
622#[cfg(feature = "onnx")]
623pub use backends::{BertNEROnnx, GLiNEROnnx};
624
625#[cfg(feature = "candle")]
626pub use backends::CandleNER;
627
628// Constants
629
/// Default BERT ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_BERT_ONNX_MODEL: &str = "protectai/bert-base-NER-onnx";

/// Default GLiNER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_GLINER_MODEL: &str = "onnx-community/gliner_small-v2.1";

/// Default GLiNER2 ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_GLINER2_MODEL: &str = "onnx-community/gliner-multitask-large-v0.5";

/// Default Candle model identifier (HuggingFace model ID).
/// Uses dslim's model, which ships both tokenizer.json and safetensors.
// NOTE(review): the previous comment said "dbmdz's model", but the value below
// is a dslim repo — corrected to match the constant.
pub const DEFAULT_CANDLE_MODEL: &str = "dslim/bert-base-NER";

/// Default GLiNER Candle model identifier (HuggingFace model ID).
/// Uses a model with tokenizer.json and pytorch_model.bin for Candle compatibility.
/// The backend converts pytorch_model.bin to safetensors automatically.
// NeuML/gliner-bert-tiny uses BERT (not DeBERTa) which is compatible with CandleEncoder
// Other GLiNER models use DeBERTa-v3 which has different architecture (relative attention)
pub const DEFAULT_GLINER_CANDLE_MODEL: &str = "NeuML/gliner-bert-tiny";

/// Default NuNER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_NUNER_MODEL: &str = "deepanwa/NuNerZero_onnx";

/// Default W2NER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_W2NER_MODEL: &str = "ljynlp/w2ner-bert-base";
655
656/// Automatically select the best available NER backend.
657pub fn auto() -> Result<Box<dyn Model>> {
658 #[cfg(feature = "onnx")]
659 {
660 if let Ok(model) = GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
661 return Ok(Box::new(model));
662 }
663 if let Ok(model) = BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
664 return Ok(Box::new(model));
665 }
666 }
667 #[cfg(feature = "candle")]
668 {
669 if let Ok(model) = CandleNER::from_pretrained(DEFAULT_CANDLE_MODEL) {
670 return Ok(Box::new(model));
671 }
672 }
673 Ok(Box::new(StackedNER::default()))
674}
675
/// Check which backends are currently available.
///
/// Feature-gated ML backends are always listed — with `false` when their
/// feature is compiled out — so the CLI can tell users what they are missing.
pub fn available_backends() -> Vec<(&'static str, bool)> {
    // Keep this list stable and conservative: it is used by the CLI
    // (`anno models list`) to show what a given build can actually instantiate.
    vec![
        // Zero-dependency / always compiled.
        ("RegexNER", true),
        ("HeuristicNER", true),
        ("StackedNER", true),
        ("EnsembleNER", true),
        ("CrfNER", true),
        ("HmmNER", true),
        // Feature-gated ML backends; `cfg!` evaluates at compile time.
        ("BertNEROnnx", cfg!(feature = "onnx")),
        ("GLiNEROnnx", cfg!(feature = "onnx")),
        ("NuNER", cfg!(feature = "onnx")),
        ("W2NER", cfg!(feature = "onnx")),
        ("CandleNER", cfg!(feature = "candle")),
    ]
}
718
/// A mock NER model for testing purposes.
///
/// This is provided so tests can create custom mock implementations
/// without breaking the sealed trait pattern.
///
/// # Entity Validation
///
/// By default, `extract_entities` validates that each entity's end offset is
/// within the input text bounds and that the stored surface text matches what
/// the offsets select. Call [`MockModel::without_validation`] to disable this
/// (useful for testing error handling). `start < end` and confidence bounds
/// are always checked eagerly in [`MockModel::with_entities`].
///
/// # Example
///
/// ```rust
/// use anno::{MockModel, Entity, EntityType, Result};
///
/// let mock = MockModel::new("test-mock")
///     .with_entities(vec![
///         Entity::new("John", EntityType::Person, 0, 4, 0.9),
///     ]);
///
/// // Use mock in tests
/// ```
#[derive(Clone)]
pub struct MockModel {
    /// Model name identifier.
    name: &'static str,
    /// Entities to return when `extract_entities` is called.
    entities: Vec<Entity>,
    /// Supported entity types for this mock model.
    types: Vec<EntityType>,
    /// If true, validate entity offsets against input text (default: true).
    validate: bool,
}
753
754impl MockModel {
755 /// Create a new mock model.
756 #[must_use]
757 pub fn new(name: &'static str) -> Self {
758 Self {
759 name,
760 entities: Vec::new(),
761 types: Vec::new(),
762 validate: true,
763 }
764 }
765
766 /// Set entities to return on extraction.
767 ///
768 /// # Panics
769 ///
770 /// Panics if any entity has `start >= end`.
771 #[must_use]
772 pub fn with_entities(mut self, entities: Vec<Entity>) -> Self {
773 // Basic validation on construction
774 for (i, e) in entities.iter().enumerate() {
775 assert!(
776 e.start < e.end,
777 "MockModel entity {}: start ({}) must be < end ({})",
778 i,
779 e.start,
780 e.end
781 );
782 assert!(
783 e.confidence >= 0.0 && e.confidence <= 1.0,
784 "MockModel entity {}: confidence ({}) must be in [0.0, 1.0]",
785 i,
786 e.confidence
787 );
788 }
789 self.entities = entities;
790 self
791 }
792
793 /// Set supported entity types.
794 #[must_use]
795 pub fn with_types(mut self, types: Vec<EntityType>) -> Self {
796 self.types = types;
797 self
798 }
799
800 /// Disable offset validation during extraction (for testing error paths).
801 #[must_use]
802 pub fn without_validation(mut self) -> Self {
803 self.validate = false;
804 self
805 }
806
807 /// Validate that entity offsets are within text bounds.
808 fn validate_entities(&self, text: &str) -> Result<()> {
809 // Performance optimization: Cache text length (called once, used for all entities)
810 let text_len = text.chars().count();
811 for (i, e) in self.entities.iter().enumerate() {
812 if e.end > text_len {
813 return Err(Error::InvalidInput(format!(
814 "MockModel entity {} '{}': end offset ({}) exceeds text length ({} chars)",
815 i, e.text, e.end, text_len
816 )));
817 }
818 // Verify text matches (using char offsets)
819 // Use optimized extract_text_with_len to avoid recalculating length
820 let actual_text = e.extract_text_with_len(text, text_len);
821 if actual_text != e.text {
822 return Err(Error::InvalidInput(format!(
823 "MockModel entity {} text mismatch: expected '{}' at [{},{}), found '{}'",
824 i, e.text, e.start, e.end, actual_text
825 )));
826 }
827 }
828 Ok(())
829 }
830}
831
impl Model for MockModel {
    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
        // Skip validation when the fixture is empty: there is nothing to
        // check, and `validate_entities` would do a needless char count.
        if self.validate && !self.entities.is_empty() {
            self.validate_entities(text)?;
        }
        Ok(self.entities.clone())
    }

    fn supported_types(&self) -> Vec<EntityType> {
        self.types.clone()
    }

    fn is_available(&self) -> bool {
        // A mock is always "ready" — it never loads weights or touches I/O.
        true
    }

    fn name(&self) -> &'static str {
        self.name
    }

    fn description(&self) -> &'static str {
        "Mock NER model for testing"
    }
}
856
857// CI matrix harness moved to `anno-eval`.