anno/lib.rs
1//! # anno
2//!
3//! Information extraction: named entity recognition (NER) and within-document coreference.
4//!
5//! - **NER output**: variable-length spans with **character offsets** (Unicode scalar values), not
6//! byte offsets.
7//! - **Coreference output**: clusters (“tracks”) of mentions within one document.
8//!
9//! This crate focuses on inference-time extraction. Dataset loaders, benchmarking, and matrix
10//! evaluation tooling live in `anno-eval` (and the `anno` CLI lives in `anno-cli`).
11//!
12//! ## Quickstart
13//!
14//! ```rust
15//! use anno::{Model, StackedNER};
16//!
17//! let m = StackedNER::default();
18//! let ents = m.extract_entities("Lynn Conway worked at IBM and Xerox PARC.", None)?;
19//! assert!(!ents.is_empty());
20//! # Ok::<(), anno::Error>(())
21//! ```
22//!
23//! ## Zero-shot custom entity types
24//!
25//! Zero-shot custom entity types are provided by GLiNER backends when the `onnx` feature is
26//! enabled. See the repo docs for the CLI flag (`--extract-types`) and the library API.
27//!
28//! ## Offline / downloads
29//!
30//! By default, ML weights may download on first use. To force cached-only behavior, set
31//! `ANNO_NO_DOWNLOADS=1` (after prefetching models).
32
33#![warn(missing_docs)]
34
35// Allow unit tests (and included CI test modules) to refer to this crate as `anno::...`,
36// matching integration-test style imports.
37extern crate self as anno;
38
39// Module declarations (standard Cargo layout under `src/`)
40pub mod backends;
41/// Edit distance algorithms.
42pub mod edit_distance;
43pub mod env;
44pub mod error;
45/// Evaluation/analysis primitives (coref metrics, cluster encoders, etc.).
46///
47/// This module is only available when the legacy `eval` feature (or the preferred `analysis`
48/// alias) is enabled.
49#[cfg(any(feature = "analysis", feature = "eval"))]
50pub mod eval;
51/// Entity feature extraction for downstream ML and analysis.
52pub mod features;
53/// Small, dependency-light heuristics (negation, quantifiers, etc.).
54pub mod heuristics;
55/// Lightweight URL/file ingestion helpers (not a crawling/pipeline product).
56pub mod ingest;
57/// Joint inference experiments (optional; not the primary API surface).
58pub mod joint;
59/// Keyword and keyphrase extraction (TF-IDF, YAKE, TextRank).
60#[cfg(feature = "graph")]
61pub mod keywords;
62pub mod lang;
63/// Knowledge-base linking helpers (experimental).
64pub mod linking;
65pub mod offset;
66/// Shared PageRank algorithm for graph-based ranking.
67#[cfg(feature = "graph")]
68pub mod pagerank;
69/// Preprocessing for mention detection.
70pub mod preprocess;
71/// Entity salience and importance ranking.
72#[cfg(feature = "graph")]
73pub mod salience;
74pub mod schema;
75pub mod similarity;
76/// Extractive summarization.
77#[cfg(feature = "graph")]
78pub mod summarize;
79pub mod sync;
80/// Temporal entity tracking, parsing, and diachronic NER.
81pub mod temporal;
82/// Language-specific tokenization for multilingual NLP.
83pub mod tokenizer;
84pub mod types;
85
86// Note: research-only geometry experiments were archived out of `anno` to keep the public
87// surface grounded. Prefer `docs/` for repo-local design notes and experiments.
88
89/// Discourse-level analysis for coreference resolution.
90///
91/// Provides infrastructure for handling phenomena that span sentence boundaries:
92///
93/// - **Centering theory**: Track discourse focus through forward/backward-looking centers
94/// - **Uncertain reference**: Deferred resolution using epsilon-term semantics
95/// - **Abstract anaphora**: Pronouns referring to events, propositions, facts
96/// - **Shell nouns**: Abstract nouns like "problem", "issue", "fact"
97///
98/// Enable with the `discourse` feature.
99///
100/// See `discourse::centering` for salience-based pronoun resolution and
101/// `discourse::uncertain_reference` for handling ambiguous references.
102#[cfg(feature = "discourse")]
103pub mod discourse;
104
105// Re-export error types
106pub use error::{Error, Result};
107
108// =============================================================================
109// Core types live in `anno-core`
110// =============================================================================
111
112// Re-export core types at the crate root (the `anno` public API surface).
113pub use anno_core::{
114 generate_span_candidates, CorefChain, CorefDocument, CoreferenceResolver, Corpus,
115 DiscontinuousSpan, Entity, EntityBuilder, EntityCategory, EntityType, EntityViewport,
116 ExtractionMethod, Gender, GroundedDocument, HashMapLexicon, HierarchicalConfidence, Identity,
117 IdentityId, IdentitySource, Lexicon, Location, Mention, MentionType, Modality, Number, Person,
118 PhiFeatures, Provenance, Quantifier, RaggedBatch, Relation, Signal, SignalId, SignalRef, Span,
119 SpanCandidate, Track, TrackId, TrackRef, TrackStats, TypeLabel, TypeMapper, ValidationIssue,
120};
121
122/// `anno-core`’s stable types under a namespaced module.
123///
124/// This exists for readability in downstream codebases (e.g. `anno::core::Entity`)
125/// and mirrors the structure of the internal `anno-core` crate.
126pub mod core {
127 pub use anno_core::core::*;
128}
129
130// Re-export commonly used types
131pub use lang::{detect_language, Language};
132pub use offset::{
133 bytes_to_chars, chars_to_bytes, is_ascii, OffsetMapping, SpanConverter, TextSpan, TokenSpan,
134};
135pub use schema::*;
136pub use similarity::*;
137pub use sync::*;
138pub use types::*;
139
140// =============================================================================
141// Sealed Trait Pattern
142// =============================================================================
143//
144// The `Model` trait is sealed to:
145// 1. Maintain invariants (entities have valid offsets, confidence in [0,1])
146// 2. Allow adding methods without breaking external implementations
147// 3. Ensure all backends share consistent behavior
148//
149// For external/plugin backends, use the `AnyModel` wrapper (see below).
150// =============================================================================
151
// Private module: because `sealed` is not `pub`, code outside this crate cannot name
// `sealed::Sealed` and therefore cannot satisfy the `Model: sealed::Sealed` supertrait bound.
// Every in-crate type that implements `Model` is opted in below; `AnyModel` is opted in next
// to its own definition instead.
mod sealed {
    // Marker supertrait of `Model`; it declares no methods.
    pub trait Sealed {}

    // Backends with no feature gate, reachable from the crate root.
    impl Sealed for super::RegexNER {}
    impl Sealed for super::HeuristicNER {}
    impl Sealed for super::StackedNER {}
    impl Sealed for super::EnsembleNER {}
    impl Sealed for super::CrfNER {}
    impl Sealed for super::NuNER {}
    impl Sealed for super::W2NER {}
    impl Sealed for super::NERExtractor {}

    // Backends that only exist when the `onnx` feature is enabled.
    #[cfg(feature = "onnx")]
    impl Sealed for super::BertNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::GLiNEROnnx {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::albert::ALBERTNER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::deberta_v3::DeBERTaV3NER {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner_poly::GLiNERPoly {}

    #[cfg(feature = "onnx")]
    impl Sealed for super::backends::gliner2::GLiNER2Onnx {}

    // Backends that only exist when the `candle` feature is enabled.
    #[cfg(feature = "candle")]
    impl Sealed for super::CandleNER {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner_candle::GLiNERCandle {}

    #[cfg(feature = "candle")]
    impl Sealed for super::backends::gliner2::GLiNER2Candle {}

    // Generic over the pipeline's encoder parameter; the bound here is `Send + Sync`.
    #[cfg(feature = "candle")]
    impl<E: Send + Sync> Sealed for super::backends::gliner_pipeline::GLiNERPipeline<E> {}

    // Backend that only exists when the `burn` feature is enabled.
    #[cfg(feature = "burn")]
    impl Sealed for super::backends::burn::BurnNER {}

    // Further ungated backends addressed through their module paths.
    impl Sealed for super::backends::tplinker::TPLinker {}
    impl Sealed for super::backends::universal_ner::UniversalNER {}
    impl Sealed for super::backends::lexicon::LexiconNER {}

    // NOTE(review): presumably `RuleBasedNER` carries `#[deprecated]`; the `allow` keeps this
    // impl from triggering that warning — confirm against `backends::rule`.
    #[allow(deprecated)]
    impl Sealed for super::backends::rule::RuleBasedNER {}

    // Test double and the joint-inference model.
    impl Sealed for super::MockModel {}
    impl Sealed for super::joint::JointModel {}
}
207
/// Trait for NER model backends.
///
/// # Sealed Trait
///
/// `Model` is intentionally sealed (cannot be implemented outside this crate) to:
///
/// 1. **Maintain invariants**: All backends must produce entities with valid character
///    offsets, confidence in `[0, 1]`, and non-empty text.
/// 2. **Allow evolution**: New methods can be added with default implementations
///    without breaking external code.
/// 3. **Ensure consistency**: All backends share standardized behavior for
///    `is_available()`, `supported_types()`, etc.
///
/// # For External Backends
///
/// If you need to integrate an external NER backend (e.g., a REST API, Python model
/// via PyO3, or custom implementation), use the [`AnyModel`] wrapper:
///
/// ```rust,ignore
/// use anno::{AnyModel, Entity, EntityType, Model, Result};
///
/// struct MyExternalNER { /* ... */ }
///
/// impl MyExternalNER {
///     fn extract(&self, text: &str) -> Vec<Entity> {
///         // Your implementation
///         vec![]
///     }
/// }
///
/// let my_ner = MyExternalNER { /* ... */ };
///
/// // Wrap in AnyModel to use with anno's infrastructure.
/// // Arguments: name, description, supported types, extractor closure.
/// let model = AnyModel::new(
///     "my-ner",
///     "External NER backend",
///     vec![EntityType::Person, EntityType::Organization],
///     move |text, _lang| Ok(my_ner.extract(text)),
/// );
///
/// // Now usable wherever a `Model` is expected (e.g. boxed as `Box<dyn Model>`)
/// let entities = model.extract_entities("Hello world", None)?;
/// ```
///
/// [`AnyModel`]: crate::AnyModel
pub trait Model: sealed::Sealed + Send + Sync {
    /// Extract entities from text.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `language` - Optional language hint; `None` leaves the choice to the backend
    ///
    /// # Errors
    ///
    /// Returns whatever error the backend reports when extraction fails.
    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>>;

    /// Get supported entity types.
    fn supported_types(&self) -> Vec<EntityType>;

    /// Check if model is available and ready.
    fn is_available(&self) -> bool;

    /// Get the model name/identifier.
    ///
    /// Default implementation returns `"unknown"`.
    fn name(&self) -> &'static str {
        "unknown"
    }

    /// Get a description of the model.
    ///
    /// Default implementation returns `"Unknown NER model"`.
    fn description(&self) -> &'static str {
        "Unknown NER model"
    }

    /// Get capability summary for this model.
    ///
    /// Override this in implementations that support additional capabilities
    /// (batch, GPU, streaming, etc.) to enable runtime discovery.
    ///
    /// # Default
    ///
    /// Returns a [`ModelCapabilities`] with all fields set to `false`/`None`.
    fn capabilities(&self) -> ModelCapabilities {
        ModelCapabilities::default()
    }

    /// Get a version identifier for the model configuration/weights.
    ///
    /// Used for cache invalidation. Default implementation returns "1".
    fn version(&self) -> String {
        "1".to_string()
    }
}
289
290// =============================================================================
291// AnyModel: Adapter for External Backends
292// =============================================================================
293
/// Type alias for the `AnyModel` extractor closure.
///
/// The closure receives the input text and an optional language hint and returns the
/// extracted entities; it must be `Send + Sync`.
type AnyModelExtractor = dyn Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync;

/// A wrapper that allows external code to implement NER backends without
/// directly implementing the sealed `Model` trait.
///
/// `AnyModel` acts as an adapter: you provide a closure that does the actual
/// entity extraction, and `AnyModel` implements `Model` on your behalf.
///
/// # Example
///
/// ```rust
/// use anno::{AnyModel, Entity, EntityType, Model, Result};
///
/// // Define extraction logic as a closure or function
/// let my_extractor = |text: &str, _lang: Option<&str>| -> Result<Vec<Entity>> {
///     // Your custom NER logic here
///     Ok(vec![])
/// };
///
/// // Wrap in AnyModel
/// let model = AnyModel::new(
///     "my-custom-ner",
///     "Custom NER backend using external API",
///     vec![EntityType::Person, EntityType::Organization],
///     my_extractor,
/// );
///
/// // Use like any other Model
/// assert!(model.is_available());
/// let entities = model.extract_entities("Hello world", None).unwrap();
/// ```
///
/// # Thread Safety
///
/// The extractor closure must be `Send + Sync`. For interior mutability
/// (e.g., caching, connection pooling), use `Arc<Mutex<...>>` or similar.
pub struct AnyModel {
    /// Identifier reported through `Model::name`.
    name: &'static str,
    /// Human-readable text reported through `Model::description`.
    description: &'static str,
    /// Entity types reported through `Model::supported_types`.
    supported_types: Vec<EntityType>,
    /// Boxed closure that performs the extraction.
    extractor: Box<AnyModelExtractor>,
    /// Version string; `new` initializes it to `"1"`, `with_version` replaces it.
    version: String,
}
339
340impl AnyModel {
341 /// Create a new `AnyModel` wrapper.
342 ///
343 /// # Arguments
344 ///
345 /// * `name` - Model identifier (e.g., "my-ner")
346 /// * `description` - Human-readable description
347 /// * `supported_types` - Entity types this model can extract
348 /// * `extractor` - Closure that performs the actual extraction
349 pub fn new(
350 name: &'static str,
351 description: &'static str,
352 supported_types: Vec<EntityType>,
353 extractor: impl Fn(&str, Option<&str>) -> Result<Vec<Entity>> + Send + Sync + 'static,
354 ) -> Self {
355 Self {
356 name,
357 description,
358 supported_types,
359 extractor: Box::new(extractor),
360 version: "1".to_string(),
361 }
362 }
363
364 /// Set the version string for cache invalidation.
365 pub fn with_version(mut self, version: impl Into<String>) -> Self {
366 self.version = version.into();
367 self
368 }
369}
370
371impl std::fmt::Debug for AnyModel {
372 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
373 f.debug_struct("AnyModel")
374 .field("name", &self.name)
375 .field("description", &self.description)
376 .field("supported_types", &self.supported_types)
377 .finish()
378 }
379}
380
381// AnyModel gets the Sealed impl so it can implement Model
382impl sealed::Sealed for AnyModel {}
383
384impl Model for AnyModel {
385 fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
386 (self.extractor)(text, language)
387 }
388
389 fn supported_types(&self) -> Vec<EntityType> {
390 self.supported_types.clone()
391 }
392
393 fn is_available(&self) -> bool {
394 true
395 }
396
397 fn name(&self) -> &'static str {
398 self.name
399 }
400
401 fn description(&self) -> &'static str {
402 self.description
403 }
404
405 fn version(&self) -> String {
406 self.version.clone()
407 }
408}
409
410// =============================================================================
411// Capability Marker Traits
412// =============================================================================
413
414/// Trait for models that support batch processing.
415///
416/// Models implementing this trait can process multiple texts efficiently,
417/// potentially using parallel processing or optimized batch operations.
418pub trait BatchCapable: Model {
419 /// Extract entities from multiple texts in a batch.
420 ///
421 /// # Arguments
422 /// * `texts` - Slice of text strings to process
423 /// * `language` - Optional language hint for the texts
424 ///
425 /// # Returns
426 /// A vector of entity vectors, one per input text
427 fn extract_entities_batch(
428 &self,
429 texts: &[&str],
430 language: Option<&str>,
431 ) -> Result<Vec<Vec<Entity>>> {
432 texts
433 .iter()
434 .map(|text| self.extract_entities(text, language))
435 .collect()
436 }
437
438 /// Get the optimal batch size for this model, if applicable.
439 ///
440 /// Returns `None` if the model doesn't have a specific optimal batch size,
441 /// or `Some(n)` if there's a recommended batch size for best performance.
442 fn optimal_batch_size(&self) -> Option<usize> {
443 None
444 }
445}
446
/// Trait for models that support GPU acceleration.
///
/// Models implementing this trait can report whether GPU is active
/// and which device they're using. Both methods are required; there are no defaults.
pub trait GpuCapable: Model {
    /// Check if GPU acceleration is currently active.
    ///
    /// Returns `true` if the model is using GPU, `false` if using CPU.
    fn is_gpu_active(&self) -> bool;

    /// Get the device identifier (e.g., "cuda:0", "cpu").
    ///
    /// Returns a string describing the compute device being used. The returned slice
    /// borrows from `self`.
    fn device(&self) -> &str;
}
462
463/// Trait for models that support streaming/chunked extraction.
464///
465/// Useful for processing very long documents by splitting them into chunks
466/// and extracting entities from each chunk with proper offset tracking.
467pub trait StreamingCapable: Model {
468 /// Extract entities from a chunk of text, adjusting offsets by the chunk's position.
469 ///
470 /// # Arguments
471 ///
472 /// * `chunk` - A portion of the full document text
473 /// * `offset` - Character offset of this chunk within the full document
474 ///
475 /// # Returns
476 ///
477 /// Entities with offsets adjusted to their position in the full document.
478 fn extract_entities_streaming(&self, chunk: &str, offset: usize) -> Result<Vec<Entity>> {
479 let entities = self.extract_entities(chunk, None)?;
480 Ok(entities
481 .into_iter()
482 .map(|mut e| {
483 e.start += offset;
484 e.end += offset;
485 e
486 })
487 .collect())
488 }
489
490 /// Get the recommended chunk size for streaming extraction.
491 ///
492 /// Returns the optimal number of characters per chunk for this model.
493 /// Default implementation returns 10,000 characters.
494 fn recommended_chunk_size(&self) -> usize {
495 10_000
496 }
497}
498
/// Marker trait for models that extract named entities (persons, organizations, locations).
///
/// This is a marker trait used for type-level distinctions between different
/// model capabilities. All NER models should implement this. It declares no methods;
/// its only requirement is the `Model` supertrait.
pub trait NamedEntityCapable: Model {}

/// Marker trait for models that extract structured entities (dates, times, money, etc.).
///
/// This is a marker trait used for type-level distinctions between different
/// model capabilities. Models that extract structured data (like `RegexNER`) should implement this.
/// It declares no methods; its only requirement is the `Model` supertrait.
pub trait StructuredEntityCapable: Model {}
510
511// =============================================================================
512// Capability Discovery for Trait Objects
513// =============================================================================
514
/// Summary of a model's capabilities, useful when working with `Box<dyn Model>`.
///
/// Since capability traits (`BatchCapable`, `GpuCapable`, etc.) can't be queried
/// through a `Box<dyn Model>` without downcasting, this struct provides a static
/// summary of what the model supports.
///
/// The derived `Default` sets every flag to `false` and every optional value to `None`,
/// which is what `Model::capabilities` returns unless a backend overrides it.
///
/// NOTE(review): the fields are plain data filled in by each backend; nothing in this
/// type checks that a flag matches the traits the backend actually implements.
///
/// # Example
///
/// ```rust,ignore
/// use anno::{Model, ModelCapabilities};
///
/// fn process_with_model(model: &dyn Model) {
///     let caps = model.capabilities();
///
///     if caps.batch_capable {
///         println!("Model supports batch processing");
///     }
///     if caps.gpu_capable {
///         println!("Model can use GPU: {:?}", caps.device);
///     }
/// }
/// ```
#[derive(Debug, Clone, Default)]
pub struct ModelCapabilities {
    /// True if the model implements `BatchCapable`.
    pub batch_capable: bool,
    /// Optimal batch size, if batch capable.
    pub optimal_batch_size: Option<usize>,
    /// True if the model implements `GpuCapable`.
    pub gpu_capable: bool,
    /// True if GPU is currently active.
    pub gpu_active: bool,
    /// Device identifier (e.g., "cuda:0", "cpu"), if GPU capable.
    pub device: Option<String>,
    /// True if the model implements `StreamingCapable`.
    pub streaming_capable: bool,
    /// Recommended chunk size for streaming, if streaming capable.
    pub recommended_chunk_size: Option<usize>,
    /// True if the model implements `RelationCapable`.
    pub relation_capable: bool,
    /// True if the model implements `DynamicLabels` (zero-shot, caller-supplied entity types).
    pub dynamic_labels: bool,
    /// True if the model can extract discontinuous entities spanning non-adjacent spans.
    /// Only `W2NER` (when loaded with an ONNX session) sets this today.
    pub discontinuous_capable: bool,
}
561
/// Trait for models that can extract relations between entities.
///
/// Models implementing this trait can jointly extract entities and their relationships,
/// producing (head, relation_type, tail) triples.
pub trait RelationCapable: Model {
    /// Extract entities and their relations from text.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// A tuple of (entities, relations) where relations link entities together.
    ///
    /// # Errors
    ///
    /// Returns the implementing backend's error when extraction fails.
    fn extract_with_relations(
        &self,
        text: &str,
        language: Option<&str>,
    ) -> Result<(Vec<Entity>, Vec<Relation>)>;
}
583
/// Trait for models that support dynamic/zero-shot entity type specification.
///
/// Models implementing this trait can extract entities of arbitrary types
/// specified at inference time (e.g., GLiNER, UniversalNER), rather than
/// being limited to a fixed set of pre-trained types.
pub trait DynamicLabels: Model {
    /// Extract entities with custom type labels.
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to extract from
    /// * `labels` - Custom entity type labels to extract (e.g., ["PERSON", "ORGANIZATION"])
    /// * `language` - Optional language hint (e.g., "en", "es")
    ///
    /// # Returns
    ///
    /// Entities of the specified types found in the text.
    ///
    /// # Errors
    ///
    /// Returns the implementing backend's error when extraction fails.
    fn extract_with_labels(
        &self,
        text: &str,
        labels: &[&str],
        language: Option<&str>,
    ) -> Result<Vec<Entity>>;
}
608
609// Re-export backends
610pub use backends::label_prompt::{LabelNormalizer, StandardNormalizer};
611pub use backends::{
612 AutoNER, BackendType, ConflictStrategy, CrfNER, EnsembleNER, HeuristicNER, LexiconNER,
613 NERExtractor, NuNER, RegexNER, StackedNER, TPLinker, W2NERConfig, W2NERRelation, W2NER,
614};
615
616// Mention-ranking coreference (Bourgois & Poibeau 2025)
617pub use backends::mention_ranking::{
618 ClusteringStrategy, MentionCluster, MentionRankingConfig, MentionRankingCoref, RankedMention,
619};
620
// `MockModel` is defined later in this file, so it needs no re-export.

// Glob re-export of the public items in `backends::inference`.
624pub use backends::inference::*;
625
626#[cfg(feature = "onnx")]
627pub use backends::{BertNEROnnx, GLiNEROnnx};
628
629#[cfg(feature = "candle")]
630pub use backends::CandleNER;
631
632// Constants
633
/// Default BERT ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_BERT_ONNX_MODEL: &str = "protectai/bert-base-NER-onnx";

/// Default GLiNER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_GLINER_MODEL: &str = "onnx-community/gliner_small-v2.1";

/// Default GLiNER2 ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_GLINER2_MODEL: &str = "onnx-community/gliner-multitask-large-v0.5";

/// Default Candle model identifier (HuggingFace model ID).
///
/// Points at `dslim/bert-base-NER`. The Candle backend is expected to find both
/// `tokenizer.json` and safetensors weights in that repository.
// NOTE(review): this comment previously credited "dbmdz", but the ID below lives under the
// `dslim` namespace — confirm the repository really ships both files.
pub const DEFAULT_CANDLE_MODEL: &str = "dslim/bert-base-NER";

/// Default GLiNER Candle model identifier (HuggingFace model ID).
/// Uses a model with tokenizer.json and pytorch_model.bin for Candle compatibility.
/// The backend converts pytorch_model.bin to safetensors automatically.
// NeuML/gliner-bert-tiny uses BERT (not DeBERTa), which is compatible with CandleEncoder.
// Other GLiNER models use DeBERTa-v3, which has a different architecture (relative attention).
pub const DEFAULT_GLINER_CANDLE_MODEL: &str = "NeuML/gliner-bert-tiny";

/// Default NuNER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_NUNER_MODEL: &str = "deepanwa/NuNerZero_onnx";

/// Default W2NER ONNX model identifier (HuggingFace model ID).
pub const DEFAULT_W2NER_MODEL: &str = "ljynlp/w2ner-bert-base";
659
660/// Automatically select the best available NER backend.
661pub fn auto() -> Result<Box<dyn Model>> {
662 #[cfg(feature = "onnx")]
663 {
664 if let Ok(model) = GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
665 return Ok(Box::new(model));
666 }
667 if let Ok(model) = BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
668 return Ok(Box::new(model));
669 }
670 }
671 #[cfg(feature = "candle")]
672 {
673 if let Ok(model) = CandleNER::from_pretrained(DEFAULT_CANDLE_MODEL) {
674 return Ok(Box::new(model));
675 }
676 }
677 Ok(Box::new(StackedNER::default()))
678}
679
/// Report which backends this build can instantiate.
///
/// Returns `(backend name, available)` pairs in a fixed order: the always-compiled
/// backends first (all `true`), then the `onnx`-gated ones, then the `candle`-gated one.
/// Feature-gated backends are listed even when their feature is off (with `false`) so a
/// caller such as the CLI's `anno models list` can tell users what they are missing.
pub fn available_backends() -> Vec<(&'static str, bool)> {
    // Keep these tables stable and conservative: the order is part of the output.
    const ALWAYS_COMPILED: [&str; 6] = [
        "RegexNER",
        "HeuristicNER",
        "StackedNER",
        "EnsembleNER",
        "CrfNER",
        "HmmNER",
    ];
    const ONNX_GATED: [&str; 4] = ["BertNEROnnx", "GLiNEROnnx", "NuNER", "W2NER"];
    const CANDLE_GATED: [&str; 1] = ["CandleNER"];

    // Compile-time feature flags, evaluated once.
    let onnx_enabled = cfg!(feature = "onnx");
    let candle_enabled = cfg!(feature = "candle");

    ALWAYS_COMPILED
        .iter()
        .map(|&name| (name, true))
        .chain(ONNX_GATED.iter().map(move |&name| (name, onnx_enabled)))
        .chain(CANDLE_GATED.iter().map(move |&name| (name, candle_enabled)))
        .collect()
}
722
/// A mock NER model for testing purposes.
///
/// This is provided so tests can create custom mock implementations
/// without breaking the sealed trait pattern.
///
/// # Entity Validation
///
/// `with_entities` asserts at construction time that every entity has `start < end`
/// and a confidence in `[0.0, 1.0]`.
///
/// By default, `extract_entities` additionally checks each configured entity against the
/// input text: its `end` offset must not exceed the text's character count, and the text
/// found at its offsets must equal the entity's `text`. Call `without_validation()`
/// to disable this (useful for testing error handling).
///
/// # Example
///
/// ```rust
/// use anno::{MockModel, Entity, EntityType, Result};
///
/// let mock = MockModel::new("test-mock")
///     .with_entities(vec![
///         Entity::new("John", EntityType::Person, 0, 4, 0.9),
///     ]);
///
/// // Use mock in tests
/// ```
#[derive(Clone)]
pub struct MockModel {
    /// Model name identifier.
    name: &'static str,
    /// Entities to return when `extract_entities` is called.
    entities: Vec<Entity>,
    /// Supported entity types for this mock model.
    types: Vec<EntityType>,
    /// If true, validate entity offsets against input text (default: true).
    validate: bool,
}
757
758impl MockModel {
759 /// Create a new mock model.
760 #[must_use]
761 pub fn new(name: &'static str) -> Self {
762 Self {
763 name,
764 entities: Vec::new(),
765 types: Vec::new(),
766 validate: true,
767 }
768 }
769
770 /// Set entities to return on extraction.
771 ///
772 /// # Panics
773 ///
774 /// Panics if any entity has `start >= end`.
775 #[must_use]
776 pub fn with_entities(mut self, entities: Vec<Entity>) -> Self {
777 // Basic validation on construction
778 for (i, e) in entities.iter().enumerate() {
779 assert!(
780 e.start < e.end,
781 "MockModel entity {}: start ({}) must be < end ({})",
782 i,
783 e.start,
784 e.end
785 );
786 assert!(
787 e.confidence >= 0.0 && e.confidence <= 1.0,
788 "MockModel entity {}: confidence ({}) must be in [0.0, 1.0]",
789 i,
790 e.confidence
791 );
792 }
793 self.entities = entities;
794 self
795 }
796
797 /// Set supported entity types.
798 #[must_use]
799 pub fn with_types(mut self, types: Vec<EntityType>) -> Self {
800 self.types = types;
801 self
802 }
803
804 /// Disable offset validation during extraction (for testing error paths).
805 #[must_use]
806 pub fn without_validation(mut self) -> Self {
807 self.validate = false;
808 self
809 }
810
811 /// Validate that entity offsets are within text bounds.
812 fn validate_entities(&self, text: &str) -> Result<()> {
813 // Performance optimization: Cache text length (called once, used for all entities)
814 let text_len = text.chars().count();
815 for (i, e) in self.entities.iter().enumerate() {
816 if e.end > text_len {
817 return Err(Error::InvalidInput(format!(
818 "MockModel entity {} '{}': end offset ({}) exceeds text length ({} chars)",
819 i, e.text, e.end, text_len
820 )));
821 }
822 // Verify text matches (using char offsets)
823 // Use optimized extract_text_with_len to avoid recalculating length
824 let actual_text = e.extract_text_with_len(text, text_len);
825 if actual_text != e.text {
826 return Err(Error::InvalidInput(format!(
827 "MockModel entity {} text mismatch: expected '{}' at [{},{}), found '{}'",
828 i, e.text, e.start, e.end, actual_text
829 )));
830 }
831 }
832 Ok(())
833 }
834}
835
836impl Model for MockModel {
837 fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
838 if self.validate && !self.entities.is_empty() {
839 self.validate_entities(text)?;
840 }
841 Ok(self.entities.clone())
842 }
843
844 fn supported_types(&self) -> Vec<EntityType> {
845 self.types.clone()
846 }
847
848 fn is_available(&self) -> bool {
849 true
850 }
851
852 fn name(&self) -> &'static str {
853 self.name
854 }
855
856 fn description(&self) -> &'static str {
857 "Mock NER model for testing"
858 }
859}
860
861// CI matrix harness moved to `anno-eval`.