#![warn(missing_docs)]
#![cfg_attr(docsrs, feature(doc_cfg))]
extern crate self as anno;
pub mod active;
mod annotated;
pub mod backends;
#[cfg(feature = "discourse")]
#[cfg_attr(docsrs, doc(cfg(feature = "discourse")))]
pub mod discourse;
pub mod edit_distance;
pub mod env;
pub mod error;
pub mod export;
#[cfg(feature = "graph")]
#[cfg_attr(docsrs, doc(cfg(feature = "graph")))]
pub mod graph;
pub mod heuristics;
pub mod ingest;
pub mod lang;
#[cfg(feature = "analysis")]
#[cfg_attr(docsrs, doc(cfg(feature = "analysis")))]
pub mod metrics;
pub mod offset;
pub mod pii;
pub mod rag;
pub mod schema;
pub mod similarity;
pub mod types;
pub use error::{Error, Result};
pub mod coalesce;
pub mod core;
pub mod minimal;
pub use crate::core::{
generate_span_candidates, Animacy, Confidence, CorefChain, CorefDocument, CoreferenceResolver,
Corpus, DiscontinuousSpan, Entity, EntityBuilder, EntityCategory, EntityType, ExtractionMethod,
Gender, GroundedDocument, HashMapLexicon, HierarchicalConfidence, Identity, IdentityId,
IdentitySource, Lexicon, Location, Mention, MentionType, Modality, Number, Person, PhiFeatures,
Provenance, Quantifier, RaggedBatch, Relation, Signal, SignalId, SignalRef, Span,
SpanCandidate, Track, TrackId, TrackRef, TrackStats, TypeLabel, TypeMapper, ValidationIssue,
};
pub use crate::core::grounded::SignalValidationError;
pub use crate::core::types::{ByteOffset, CanonicalId, CharOffset};
pub use lang::{detect_language, Language};
pub use offset::{
bytes_to_chars, chars_to_bytes, is_ascii, OffsetMapping, SpanConverter, TextSpan, TokenSpan,
};
pub use similarity::string_similarity;
pub use types::EntitySliceExt;
// Private module holding the supertrait that seals `Model`.
//
// `Model` requires `sealed::Sealed`, and this module is not `pub`, so code
// outside the crate cannot name `Sealed` and therefore cannot implement `Model`.
// Every backend that implements `Model` needs a matching marker impl below
// (`AnyModel` provides its own next to its definition).
mod sealed {
/// Marker supertrait of `Model`; carries no methods.
pub trait Sealed {}
// Backends whose marker impls are compiled unconditionally.
impl Sealed for super::RegexNER {}
impl Sealed for super::HeuristicNER {}
impl Sealed for super::StackedNER {}
impl Sealed for super::EnsembleNER {}
impl Sealed for super::CrfNER {}
impl Sealed for super::NuNER {}
impl Sealed for super::W2NER {}
// The next two impls exist only when the `onnx` feature is enabled.
#[cfg(feature = "onnx")]
impl Sealed for super::BertNEROnnx {}
#[cfg(feature = "onnx")]
impl Sealed for super::GLiNEROnnx {}
impl Sealed for super::backends::gliner_poly::GLiNERPoly {}
#[cfg(feature = "onnx")]
impl Sealed for super::backends::gliner_multitask::GLiNERMultitaskOnnx {}
// The next three impls exist only when the `candle` feature is enabled.
#[cfg(feature = "candle")]
impl Sealed for super::CandleNER {}
#[cfg(feature = "candle")]
impl Sealed for super::backends::gliner_candle::GLiNERCandle {}
#[cfg(feature = "candle")]
impl Sealed for super::backends::gliner_multitask::GLiNERMultitaskCandle {}
impl Sealed for super::backends::tplinker::TPLinker {}
impl Sealed for super::backends::universal_ner::UniversalNER {}
impl Sealed for super::backends::lexicon::LexiconNER {}
impl Sealed for super::backends::hmm::HmmNER {}
impl Sealed for super::backends::heuristic_crf::HeuristicCrfNER {}
// Test-only mock; compiled only under `cfg(test)`.
#[cfg(test)]
impl Sealed for super::MockModel {}
}
/// Common interface implemented by the entity-extraction backends of this crate.
///
/// The trait is sealed through the private `sealed::Sealed` supertrait, so it can
/// only be implemented by types registered inside this crate. Implementors must
/// also be `Send + Sync`.
pub trait Model: sealed::Sealed + Send + Sync {
    /// Extracts entities from `text`.
    ///
    /// `language` is an optional hint; whether and how it is used is up to the
    /// implementation.
    ///
    /// # Errors
    /// Returns whatever error the implementation reports.
    fn extract_entities(&self, text: &str, language: Option<Language>) -> Result<Vec<Entity>>;

    /// Returns the entity types this model reports as supported.
    fn supported_types(&self) -> Vec<EntityType>;

    /// Reports whether the model considers itself usable.
    fn is_available(&self) -> bool;

    /// Short model name; `"unknown"` unless overridden.
    fn name(&self) -> &'static str {
        "unknown"
    }

    /// Human-readable description; a generic placeholder unless overridden.
    fn description(&self) -> &'static str {
        "Unknown NER model"
    }

    /// Optional capabilities of the model; all flags are off unless overridden.
    fn capabilities(&self) -> ModelCapabilities {
        ModelCapabilities::default()
    }

    /// Runs [`Model::extract_entities`] on every text, one after another.
    ///
    /// The output has one entry per input, in input order; a failure on one text
    /// is stored in its slot and does not stop the remaining texts.
    fn extract_batch(
        &self,
        texts: &[&str],
        language: Option<Language>,
    ) -> Vec<Result<Vec<Entity>>> {
        let mut results = Vec::with_capacity(texts.len());
        for &text in texts {
            results.push(self.extract_entities(text, language));
        }
        results
    }

    /// Same contract as [`Model::extract_batch`], but the per-text calls are
    /// dispatched through rayon's parallel iterator.
    #[cfg(feature = "parallel")]
    #[cfg_attr(docsrs, doc(cfg(feature = "parallel")))]
    fn par_extract_batch(
        &self,
        texts: &[&str],
        language: Option<Language>,
    ) -> Vec<Result<Vec<Entity>>> {
        use rayon::prelude::*;

        texts
            .par_iter()
            .map(|&text| self.extract_entities(text, language))
            .collect()
    }

    /// Version string of the model; `"1"` unless overridden.
    fn version(&self) -> String {
        String::from("1")
    }

    /// Returns `self` as a zero-shot extractor when the model supports it.
    ///
    /// The default implementation returns `None`.
    fn as_zero_shot(&self) -> Option<&dyn backends::inference::ZeroShotNER> {
        None
    }

    /// Returns `self` as a relation extractor when the model supports it.
    ///
    /// The default implementation returns `None`.
    fn as_relation_extractor(&self) -> Option<&dyn backends::inference::RelationExtractor> {
        None
    }
}
// Closure signature backing `Model::extract_entities` for `AnyModel`: (text, language hint).
type AnyModelExtractor = dyn Fn(&str, Option<Language>) -> Result<Vec<Entity>> + Send + Sync;
// Closure signature backing `ZeroShotNER::extract_with_types`: (text, type labels, threshold).
type AnyModelZeroShotExtractor = dyn Fn(&str, &[&str], f32) -> Result<Vec<Entity>> + Send + Sync;
// Closure signature backing `RelationExtractor::extract_relations_default`: text in,
// (entities, relations) out.
type AnyModelRelationExtractor = dyn Fn(&str) -> Result<(Vec<Entity>, Vec<Relation>)> + Send + Sync;
/// A [`Model`] assembled from closures.
///
/// Because `Model` is sealed, this type is the way to plug a custom extraction
/// function into APIs that take a `Model`. The entity closure is mandatory;
/// zero-shot and relation closures are optional and attached with
/// `with_zero_shot` / `with_relations`.
pub struct AnyModel {
// Value returned by `Model::name`.
name: &'static str,
// Value returned by `Model::description`.
description: &'static str,
// Cloned and returned by `Model::supported_types`.
supported_types: Vec<EntityType>,
// Called by `Model::extract_entities`.
extractor: Box<AnyModelExtractor>,
// Returned by `Model::version`; `new` initialises it to "1".
version: String,
// `None` until `with_zero_shot` is called.
zero_shot_extractor: Option<Box<AnyModelZeroShotExtractor>>,
// `None` until `with_relations` is called.
relation_extractor: Option<Box<AnyModelRelationExtractor>>,
}
impl AnyModel {
    /// Creates a closure-backed model.
    ///
    /// * `name` / `description` - returned verbatim by `Model::name` and
    ///   `Model::description`.
    /// * `supported_types` - returned (cloned) by `Model::supported_types`.
    /// * `extractor` - invoked for every `Model::extract_entities` call with the
    ///   text and the optional language hint.
    ///
    /// The version starts as `"1"` and no zero-shot or relation closure is set.
    #[must_use]
    pub fn new(
        name: &'static str,
        description: &'static str,
        supported_types: Vec<EntityType>,
        extractor: impl Fn(&str, Option<Language>) -> Result<Vec<Entity>> + Send + Sync + 'static,
    ) -> Self {
        Self {
            name,
            description,
            supported_types,
            extractor: Box::new(extractor),
            version: "1".to_string(),
            zero_shot_extractor: None,
            relation_extractor: None,
        }
    }

    /// Replaces the version string reported by `Model::version`.
    ///
    /// Consumes and returns the model, so the result must be used; this is
    /// marked `#[must_use]` like the other `with_*` builders.
    #[must_use]
    pub fn with_version(mut self, version: impl Into<String>) -> Self {
        self.version = version.into();
        self
    }

    /// Attaches a zero-shot closure taking `(text, entity type labels, threshold)`.
    ///
    /// Once set, `capabilities().zero_shot` is `true` and `as_zero_shot()`
    /// returns `Some`.
    #[must_use]
    pub fn with_zero_shot(
        mut self,
        f: impl Fn(&str, &[&str], f32) -> Result<Vec<Entity>> + Send + Sync + 'static,
    ) -> Self {
        self.zero_shot_extractor = Some(Box::new(f));
        self
    }

    /// Attaches a relation closure mapping text to `(entities, relations)`.
    ///
    /// Once set, `capabilities().relation_capable` is `true` and
    /// `as_relation_extractor()` returns `Some`.
    #[must_use]
    pub fn with_relations(
        mut self,
        f: impl Fn(&str) -> Result<(Vec<Entity>, Vec<Relation>)> + Send + Sync + 'static,
    ) -> Self {
        self.relation_extractor = Some(Box::new(f));
        self
    }
}
/// Manual `Debug` because the boxed closures do not implement `Debug`.
///
/// Prints the plain-data fields (including `version`) and ends with `..` to
/// signal that the closure fields are intentionally omitted.
impl std::fmt::Debug for AnyModel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("AnyModel")
            .field("name", &self.name)
            .field("description", &self.description)
            .field("supported_types", &self.supported_types)
            .field("version", &self.version)
            // The extractor closures cannot be formatted; mark the output as partial.
            .finish_non_exhaustive()
    }
}
// Marker impl that lets `AnyModel` satisfy the sealed `Model` supertrait.
impl sealed::Sealed for AnyModel {}

/// `Model` implementation that forwards to the stored closures and metadata.
impl Model for AnyModel {
    /// Calls the entity closure supplied to `AnyModel::new`.
    fn extract_entities(&self, text: &str, language: Option<Language>) -> Result<Vec<Entity>> {
        let run = &self.extractor;
        run(text, language)
    }

    /// Returns a copy of the types passed at construction.
    fn supported_types(&self) -> Vec<EntityType> {
        self.supported_types.to_vec()
    }

    /// Always `true`.
    fn is_available(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        self.name
    }

    fn description(&self) -> &'static str {
        self.description
    }

    /// Flags reflect which optional closures have been attached; every other
    /// capability keeps its default value.
    fn capabilities(&self) -> ModelCapabilities {
        let zero_shot = self.zero_shot_extractor.is_some();
        let relation_capable = self.relation_extractor.is_some();
        ModelCapabilities {
            relation_capable,
            zero_shot,
            ..ModelCapabilities::default()
        }
    }

    fn version(&self) -> String {
        self.version.as_str().to_owned()
    }

    /// `Some(self)` only when a zero-shot closure is configured.
    fn as_zero_shot(&self) -> Option<&dyn backends::inference::ZeroShotNER> {
        self.zero_shot_extractor
            .as_ref()
            .map(|_| self as &dyn backends::inference::ZeroShotNER)
    }

    /// `Some(self)` only when a relation closure is configured.
    fn as_relation_extractor(&self) -> Option<&dyn backends::inference::RelationExtractor> {
        self.relation_extractor
            .as_ref()
            .map(|_| self as &dyn backends::inference::RelationExtractor)
    }
}
impl backends::inference::ZeroShotNER for AnyModel {
fn extract_with_types(
&self,
text: &str,
entity_types: &[&str],
threshold: f32,
) -> Result<Vec<Entity>> {
match &self.zero_shot_extractor {
Some(f) => f(text, entity_types, threshold),
None => Err(Error::FeatureNotAvailable(
"AnyModel: ZeroShotNER closure not configured (use .with_zero_shot())".into(),
)),
}
}
fn extract_with_descriptions(
&self,
text: &str,
descriptions: &[&str],
threshold: f32,
) -> Result<Vec<Entity>> {
self.extract_with_types(text, descriptions, threshold)
}
fn default_types(&self) -> &[&'static str] {
&[]
}
}
impl backends::inference::RelationExtractor for AnyModel {
fn extract_with_relations(
&self,
_text: &str,
_entity_types: &[&str],
_relation_types: &[&str],
_threshold: f32,
) -> Result<backends::inference::ExtractionWithRelations> {
Err(Error::FeatureNotAvailable(
"AnyModel does not support custom entity/relation types; call \
RelationExtractor::extract_relations_default instead."
.into(),
))
}
fn extract_relations_default(&self, text: &str) -> Result<(Vec<Entity>, Vec<Relation>)> {
match &self.relation_extractor {
Some(f) => f(text),
None => Err(Error::FeatureNotAvailable(
"AnyModel: relation closure not configured (use .with_relations())".into(),
)),
}
}
}
/// Optional capabilities a model can advertise through `Model::capabilities`.
///
/// All flags default to `false`. The type is comparable (`PartialEq`/`Eq`) so
/// callers and tests can check a whole capability set at once.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ModelCapabilities {
    /// The model can extract relations between entities.
    pub relation_capable: bool,
    /// The model can extract entity types supplied at call time.
    pub zero_shot: bool,
    /// The model can produce discontinuous entities.
    // NOTE(review): meaning inferred from the field name; nothing in this file sets it.
    pub discontinuous_capable: bool,
}
pub use backends::{
ConflictStrategy, CrfNER, EnsembleNER, HeuristicNER, LexiconNER, NuNER, RegexNER, StackedNER,
TPLinker, W2NERConfig, W2NERRelation, W2NER,
};
pub use backends::coref::mention_ranking::{
ClusteringStrategy, MentionCluster, MentionRankingConfig, MentionRankingCoref, RankedMention,
};
pub use backends::CorefBackend;
pub use backends::inference::{
extract_relation_triples, extract_relation_triples_simple, extract_relations,
CoreferenceConfig, DiscontinuousEntity, DiscontinuousNER, ExtractionWithRelations,
RelationExtractionConfig, RelationExtractor, RelationTriple, ZeroShotNER,
};
#[cfg(feature = "onnx")]
#[cfg_attr(docsrs, doc(cfg(feature = "onnx")))]
pub use backends::{BertNEROnnx, GLiNEROnnx};
#[cfg(feature = "onnx")]
#[cfg_attr(docsrs, doc(cfg(feature = "onnx")))]
pub use backends::{FCoref, FCorefConfig};
#[cfg(feature = "candle")]
#[cfg_attr(docsrs, doc(cfg(feature = "candle")))]
pub use backends::CandleNER;
/// Extracts entities from `text` with a freshly built default [`StackedNER`].
///
/// No language hint is passed. A new model is constructed on every call.
///
/// # Errors
/// Propagates any error returned by the model.
pub fn extract(text: &str) -> Result<Vec<Entity>> {
    StackedNER::default().extract_entities(text, None)
}
pub fn extract_batch(texts: &[&str]) -> Vec<Result<Vec<Entity>>> {
let model = StackedNER::default();
model.extract_batch(texts, None)
}
pub use annotated::annotate;
pub use annotated::AnnotatedDoc;
/// Convenience re-exports intended for glob import (`use <crate>::prelude::*`).
///
/// Brings in the core entity types, the `Model` trait, the crate's `Error` /
/// `Result`, `StackedNER`, `AnnotatedDoc`, and the `EntitySliceExt` extension trait.
pub mod prelude {
pub use crate::types::EntitySliceExt;
pub use crate::{
AnnotatedDoc, Confidence, Entity, EntityType, Error, Language, Model, Result, StackedNER,
};
}
/// Model identifier strings, one constant per known pretrained model.
///
/// NOTE(review): the values look like Hugging Face Hub repository ids
/// (`owner/name`) — confirm against the backend loaders. Per-constant notes below
/// are derived from the constant and repository names unless stated otherwise.
pub mod models {
/// BERT NER model in ONNX form; `auto()` passes it to `BertNEROnnx::new`.
pub const BERT_ONNX: &str = "protectai/bert-base-NER-onnx";
/// GLiNER (small, v2.1) model; `auto()` passes it to `GLiNEROnnx::new`.
pub const GLINER: &str = "onnx-community/gliner_small-v2.1";
/// GLiNER multitask (large, v0.5) model.
pub const GLINER_MULTITASK: &str = "onnx-community/gliner-multitask-large-v0.5";
/// BERT NER model; `auto()` passes it to `CandleNER::from_pretrained`.
pub const CANDLE: &str = "dslim/bert-base-NER";
/// GLiNER (small, v2.1) model id exposed as `DEFAULT_GLINER_CANDLE_MODEL`.
pub const GLINER_CANDLE: &str = "urchade/gliner_small-v2.1";
/// NuNER Zero model, ONNX export (per the repository name).
pub const NUNER: &str = "deepanwa/NuNerZero_onnx";
/// NuNER Zero model.
pub const NUNER_ZERO: &str = "numind/NuNER_Zero";
/// GLiNER bi-encoder (large, v1.0) model id exposed as `DEFAULT_GLINER_POLY_MODEL`.
pub const GLINER_POLY: &str = "knowledgator/gliner-bi-large-v1.0";
/// W2NER model (BERT base).
pub const W2NER: &str = "ljynlp/w2ner-bert-base";
/// B2NER LoRA adapter model (InternLM2.5 7B, per the repository name).
pub const B2NER: &str = "Umean/B2NER-Internlm2.5-7B-LoRA";
/// DeBERTa v3 base NER model (CoNLL-2003, per the repository name).
pub const DEBERTA_V3: &str = "ficsort/deberta-v3-base-conll2003-ner";
/// Biomedical NER model.
pub const BIOMEDICAL: &str = "d4data/biomedical-ner-all";
/// GLiNER PII model (edge, v1.0).
pub const GLINER_PII: &str = "knowledgator/gliner-pii-edge-v1.0";
/// GLiNER relation-extraction model (large, v1.0).
pub const GLINER_RELEX: &str = "knowledgator/gliner-relex-large-v1.0";
/// GLiNER bi-encoder model (base, v2.0).
pub const GLINER_BI_BASE: &str = "knowledgator/gliner-bi-base-v2.0";
/// GLiNER bi-encoder model (large, v2.0).
pub const GLINER_BI_LARGE: &str = "knowledgator/gliner-bi-large-v2.0";
/// NuNER Zero model, 4k variant (per the repository name).
pub const NUNER_ZERO_4K: &str = "numind/NuNER_Zero-4k";
/// NuNER Zero model, span variant (per the repository name).
pub const NUNER_ZERO_SPAN: &str = "numind/NuNER_Zero-span";
}
// Crate-root aliases of the `models::*` constants. They are `#[doc(hidden)]`,
// so rustdoc does not list them; `models` is the documented location.

/// Alias of [`models::BERT_ONNX`].
#[doc(hidden)]
pub const DEFAULT_BERT_ONNX_MODEL: &str = models::BERT_ONNX;
/// Alias of [`models::GLINER`].
#[doc(hidden)]
pub const DEFAULT_GLINER_MODEL: &str = models::GLINER;
/// Alias of [`models::GLINER_MULTITASK`].
#[doc(hidden)]
pub const DEFAULT_GLINER_MULTITASK_MODEL: &str = models::GLINER_MULTITASK;
/// Alias of [`models::CANDLE`].
#[doc(hidden)]
pub const DEFAULT_CANDLE_MODEL: &str = models::CANDLE;
/// Alias of [`models::GLINER_CANDLE`].
#[doc(hidden)]
pub const DEFAULT_GLINER_CANDLE_MODEL: &str = models::GLINER_CANDLE;
/// Alias of [`models::NUNER`].
#[doc(hidden)]
pub const DEFAULT_NUNER_MODEL: &str = models::NUNER;
/// Alias of [`models::GLINER_POLY`].
#[doc(hidden)]
pub const DEFAULT_GLINER_POLY_MODEL: &str = models::GLINER_POLY;
/// Alias of [`models::W2NER`].
#[doc(hidden)]
pub const DEFAULT_W2NER_MODEL: &str = models::W2NER;
/// Picks a model based on the enabled cargo features and what can be constructed.
///
/// Order of preference:
/// 1. with `onnx`: `GLiNEROnnx`, then `BertNEROnnx`;
/// 2. with `candle`: `CandleNER`;
/// 3. otherwise, or when every attempt above fails: the default `StackedNER`.
///
/// Construction errors of the feature-gated backends are discarded and the next
/// candidate is tried, so in its current form this function always returns `Ok`.
pub fn auto() -> Result<Box<dyn Model>> {
    #[cfg(feature = "onnx")]
    {
        if let Ok(gliner) = GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
            let chosen: Box<dyn Model> = Box::new(gliner);
            return Ok(chosen);
        }
        if let Ok(bert) = BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
            let chosen: Box<dyn Model> = Box::new(bert);
            return Ok(chosen);
        }
    }

    #[cfg(feature = "candle")]
    {
        if let Ok(candle) = CandleNER::from_pretrained(DEFAULT_CANDLE_MODEL) {
            let chosen: Box<dyn Model> = Box::new(candle);
            return Ok(chosen);
        }
    }

    // Last resort: the default `StackedNER`, which is not gated behind any feature here.
    let fallback: Box<dyn Model> = Box::new(StackedNER::default());
    Ok(fallback)
}
/// Lists every backend in `BACKEND_CATALOG` with a flag telling whether the cargo
/// feature it needs was enabled at compile time.
///
/// Backends without a feature requirement are always reported as available. Only
/// the `onnx`, `candle` and `llm` features are recognised; a backend gated on any
/// other feature name is reported as unavailable.
pub fn available_backends() -> Vec<(&'static str, bool)> {
    use backends::catalog::BACKEND_CATALOG;

    let mut listing = Vec::new();
    for entry in BACKEND_CATALOG.iter() {
        let compiled_in = match entry.feature {
            None => true,
            Some(feature) => match feature {
                "onnx" => cfg!(feature = "onnx"),
                "candle" => cfg!(feature = "candle"),
                "llm" => cfg!(feature = "llm"),
                _ => false,
            },
        };
        listing.push((entry.name, compiled_in));
    }
    listing
}
/// Test-only `Model` that returns a fixed, pre-configured list of entities.
#[cfg(test)]
#[derive(Clone)]
pub(crate) struct MockModel {
// Returned by `Model::name`.
name: &'static str,
// Entities handed back (cloned) by every `extract_entities` call.
entities: Vec<Entity>,
// Returned by `supported_types`; `new` leaves it empty and no builder shown here fills it.
types: Vec<EntityType>,
// When true, `extract_entities` first checks the entities against the input text.
validate: bool,
}
#[cfg(test)]
impl MockModel {
    /// Creates a mock with no entities, no supported types, and validation on.
    #[must_use]
    pub fn new(name: &'static str) -> Self {
        let entities = Vec::new();
        let types = Vec::new();
        Self {
            name,
            entities,
            types,
            validate: true,
        }
    }

    /// Sets the entities the mock will return.
    ///
    /// # Panics
    /// Panics if any entity has `start >= end` or a confidence outside `[0.0, 1.0]`.
    #[must_use]
    pub fn with_entities(mut self, entities: Vec<Entity>) -> Self {
        entities.iter().enumerate().for_each(|(index, entity)| {
            assert!(
                entity.start() < entity.end(),
                "MockModel entity {}: start ({}) must be < end ({})",
                index,
                entity.start(),
                entity.end()
            );
            // Comparison kept in this exact form so a NaN confidence is rejected.
            assert!(
                entity.confidence >= 0.0 && entity.confidence <= 1.0,
                "MockModel entity {}: confidence ({}) must be in [0.0, 1.0]",
                index,
                entity.confidence
            );
        });
        self.entities = entities;
        self
    }

    /// Turns off the text/offset check performed in `extract_entities`.
    #[must_use]
    pub fn without_validation(mut self) -> Self {
        self.validate = false;
        self
    }

    /// Checks every stored entity against `text`.
    ///
    /// Offsets are compared against the number of `char`s in `text`. Fails with
    /// `Error::InvalidInput` on the first entity whose end offset is past the
    /// text or whose stored text differs from what
    /// `extract_text_with_len(text, char count)` returns.
    fn validate_entities(&self, text: &str) -> Result<()> {
        let char_count = text.chars().count();
        self.entities
            .iter()
            .enumerate()
            .try_for_each(|(index, entity)| {
                if entity.end() > char_count {
                    return Err(Error::InvalidInput(format!(
                        "MockModel entity {} '{}': end offset ({}) exceeds text length ({} chars)",
                        index,
                        entity.text,
                        entity.end(),
                        char_count
                    )));
                }

                let found = entity.extract_text_with_len(text, char_count);
                if found != entity.text {
                    return Err(Error::InvalidInput(format!(
                        "MockModel entity {} text mismatch: expected '{}' at [{},{}), found '{}'",
                        index,
                        entity.text,
                        entity.start(),
                        entity.end(),
                        found
                    )));
                }

                Ok(())
            })
    }
}
/// `Model` implementation for the test mock.
#[cfg(test)]
impl Model for MockModel {
    /// Returns a clone of the configured entities.
    ///
    /// When validation is enabled and at least one entity is configured, the
    /// entities are first checked against `text`; a failed check is returned as
    /// the error. The language hint is ignored.
    fn extract_entities(&self, text: &str, _language: Option<Language>) -> Result<Vec<Entity>> {
        let needs_check = self.validate && !self.entities.is_empty();
        if needs_check {
            self.validate_entities(text)?;
        }
        Ok(self.entities.to_vec())
    }

    fn supported_types(&self) -> Vec<EntityType> {
        self.types.to_vec()
    }

    /// Always `true`.
    fn is_available(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        self.name
    }

    fn description(&self) -> &'static str {
        "Mock NER model for testing"
    }
}
/// Tests for the optional zero-shot and relation closures of `AnyModel`.
#[cfg(test)]
mod any_model_tests {
    use super::*;

    /// An `AnyModel` with only the mandatory entity closure, which finds nothing.
    fn base_any_model() -> AnyModel {
        let supported = vec![EntityType::Person];
        AnyModel::new("test-any", "test model", supported, |_text, _lang| {
            Ok(vec![])
        })
    }

    #[test]
    fn any_model_capabilities_default_no_zero_shot_no_relations() {
        let capabilities = base_any_model().capabilities();
        assert!(
            !capabilities.zero_shot,
            "should not report zero_shot without closure"
        );
        assert!(
            !capabilities.relation_capable,
            "should not report relation_capable without closure"
        );
    }

    #[test]
    fn any_model_zero_shot_returns_entities() {
        // The closure echoes each requested label back as a one-character entity.
        let model = base_any_model().with_zero_shot(|_text, labels, _threshold| {
            let mut echoed = Vec::with_capacity(labels.len());
            for (position, &label) in labels.iter().enumerate() {
                echoed.push(Entity::new(
                    label,
                    EntityType::custom(label, EntityCategory::Misc),
                    position,
                    position + 1,
                    0.8,
                ));
            }
            Ok(echoed)
        });
        assert!(model.capabilities().zero_shot);

        let requested = ["GREETING", "NOUN"];
        let found = model
            .extract_with_types("hello world", &requested, 0.5)
            .unwrap();
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].text, "GREETING");
        assert_eq!(found[1].text, "NOUN");
    }

    #[test]
    fn any_model_zero_shot_missing_returns_feature_not_available() {
        let model = base_any_model();
        let outcome: Result<Vec<Entity>> = model.extract_with_types("hello", &["X"], 0.5);
        let err = outcome.unwrap_err();
        assert!(
            matches!(err, Error::FeatureNotAvailable(_)),
            "expected FeatureNotAvailable, got: {err:?}"
        );
    }

    #[test]
    fn any_model_relations_returns_entities_and_relations() {
        use crate::backends::inference::RelationExtractor;

        let model = base_any_model().with_relations(|_text| {
            let alice = Entity::new("Alice", EntityType::Person, 0, 5, 0.9);
            let acme = Entity::new("Acme", EntityType::Organization, 15, 19, 0.85);
            let works_at = Relation::new(alice.clone(), acme.clone(), "WORKS_AT", 0.8);
            Ok((vec![alice, acme], vec![works_at]))
        });
        assert!(model.capabilities().relation_capable);

        let (entities, relations) = model
            .extract_relations_default("Alice works at Acme Corp")
            .unwrap();
        assert_eq!(entities.len(), 2);
        assert_eq!(relations.len(), 1);
        assert_eq!(relations[0].relation_type, "WORKS_AT");
    }

    #[test]
    fn any_model_relations_missing_returns_feature_not_available() {
        use crate::backends::inference::RelationExtractor;

        let err = base_any_model()
            .extract_relations_default("hello")
            .unwrap_err();
        assert!(
            matches!(err, Error::FeatureNotAvailable(_)),
            "expected FeatureNotAvailable, got: {err:?}"
        );
    }
}
/// Smoke tests for the crate-level `extract` helper and the prelude.
#[cfg(test)]
mod convenience_tests {
    use super::*;

    #[test]
    fn extract_finds_entities() {
        let found = extract("Marie Curie won the Nobel Prize.").unwrap();
        assert!(!found.is_empty(), "extract() should find entities");
    }

    #[test]
    fn extract_empty_text() {
        let found = extract("").unwrap();
        assert!(found.is_empty());
    }

    /// Compiles and runs only if the prelude exports `StackedNER`, `Model`
    /// and `EntitySliceExt` (for `above_confidence`).
    #[test]
    fn prelude_imports_work() {
        use crate::prelude::*;

        let model = StackedNER::default();
        let found = model.extract_entities("Test input", None).unwrap();
        let _: Vec<_> = found.above_confidence(0.5).collect();
    }
}
/// Tests for the batch helpers: the free `extract_batch` function and the
/// `Model::extract_batch` / `Model::par_extract_batch` trait methods.
#[cfg(test)]
mod batch_tests {
    use super::*;

    #[test]
    fn extract_batch_empty_slice() {
        let no_texts: [&str; 0] = [];
        assert!(extract_batch(&no_texts).is_empty());
    }

    #[test]
    fn extract_batch_single_text() {
        let results = extract_batch(&["Marie Curie won the Nobel Prize."]);
        assert_eq!(results.len(), 1);

        let first = &results[0];
        assert!(first.is_ok());
        assert!(!first.as_ref().unwrap().is_empty());
    }

    #[test]
    fn extract_batch_multiple_texts() {
        let inputs = [
            "Marie Curie won the Nobel Prize.",
            "Ada Lovelace wrote the first program.",
            "No entities here in this plain sentence.",
        ];
        let results = extract_batch(&inputs);
        assert_eq!(results.len(), 3);
        results.iter().for_each(|outcome| {
            assert!(outcome.is_ok());
        });
    }

    #[test]
    fn trait_method_extract_batch_empty() {
        let model = StackedNER::default();
        let no_texts: [&str; 0] = [];
        assert!(model.extract_batch(&no_texts, None).is_empty());
    }

    #[test]
    fn trait_method_extract_batch_count() {
        let model = StackedNER::default();
        let names = ["Alice", "Bob", "Carol"];
        let results = model.extract_batch(&names, None);
        assert_eq!(results.len(), 3);
    }

    /// The parallel path must agree with the sequential one: same number of
    /// results, same success/failure per text, and the same entity text and
    /// offsets in the same order.
    #[cfg(feature = "parallel")]
    #[test]
    fn par_extract_batch_preserves_order_and_count() {
        let model = StackedNER::default();
        let inputs = [
            "Marie Curie won the Nobel Prize.",
            "Alan Turing worked at Bletchley Park.",
            "Grace Hopper helped develop COBOL.",
            "Ada Lovelace wrote the first program.",
        ];
        let sequential = model.extract_batch(&inputs, None);
        let parallel = model.par_extract_batch(&inputs, None);
        assert_eq!(parallel.len(), sequential.len());

        for (par_outcome, seq_outcome) in parallel.iter().zip(sequential.iter()) {
            assert_eq!(par_outcome.is_ok(), seq_outcome.is_ok());
            if let (Ok(par_entities), Ok(seq_entities)) = (par_outcome, seq_outcome) {
                assert_eq!(par_entities.len(), seq_entities.len());
                for (from_par, from_seq) in par_entities.iter().zip(seq_entities.iter()) {
                    assert_eq!(from_par.text, from_seq.text);
                    assert_eq!(from_par.start(), from_seq.start());
                    assert_eq!(from_par.end(), from_seq.end());
                }
            }
        }
    }
}