anno_core/core/mod.rs
1//! # anno::core
2//!
3//! Core types for the Anno NLP toolkit: Named Entity Recognition, Coreference
4//! Resolution, and Relation Extraction.
5//!
6//! ## Why This Module Exists
7//!
8//! NLP pipelines involve many components (tokenizers, NER models, coreference
9//! resolvers, entity linkers) that need to share data. Without a common type
10//! system, each component defines its own `Entity`, `Span`, `Document` types,
11//! requiring tedious conversion code and risking subtle bugs.
12//!
13//! `anno::core` solves this by providing:
14//!
15//! 1. **Canonical types** that all components agree on
16//! 2. **Rich metadata** beyond basic spans (confidence, provenance, relations)
17//! 3. **Grounded hierarchy** for multi-document, multi-modal processing
18//! 4. **Dataset abstractions** for evaluation and benchmarking
19//!
20//! ## Core Concepts
21//!
22//! ### Entities and Spans
23//!
24//! ```rust,ignore
25//! use anno_core::{Entity, EntityType, Span};
26//!
27//! let entity = Entity::new("Barack Obama", EntityType::Person)
28//!     .with_span(Span::new(0, 12))
29//!     .with_confidence(0.95);
30//! ```
31//!
32//! ### Grounded Documents
33//!
34//! For cross-document coreference, entities are "grounded" to real-world identities:
35//!
36//! ```rust,ignore
37//! use anno_core::{GroundedDocument, Identity, Signal};
38//!
39//! // Multiple mentions across documents resolve to one identity
40//! let obama_id = Identity::new("Q76"); // Wikidata ID
41//! doc1.ground_mention(mention1, obama_id.clone());
42//! doc2.ground_mention(mention2, obama_id);
43//! ```
44//!
45//! ### Dataset Specifications
46//!
47//! Define or discover evaluation datasets:
48//!
49//! ```rust,ignore
50//! use anno_core::{CustomDataset, Task, Domain, License};
51//!
52//! let dataset = CustomDataset::new("my_ner", Task::NER)
53//!     .with_languages(&["en"])
54//!     .with_domain(Domain::Biomedical)
55//!     .with_license(License::CCBY);
56//! ```
57//!
58//! ## Module Overview
59//!
60//! | Module | Purpose |
61//! |--------|---------|
62//! | [`entity`] | `Entity`, `Span`, `Relation`, `EntityType` |
63//! | [`grounded`] | `GroundedDocument`, `Identity`, `Signal`, `Track` |
64//! | [`coref`] | `Mention`, `CorefChain`, `CorefDocument` |
65//! | [`graph`] | Export to Neo4j, GraphML, JSON-LD |
66//! | [`dataset`] | `DatasetSpec`, `CustomDataset`, `DatasetRegistry` |
67//! | [`calibration`] | Confidence score calibration |
68//! | [`historical`] | Ancient language provenance (BCE dates, epigraphy) |
69//! | [`provenance`] | Document origin tracking |
70//! | [`types`] | `Gender`, `MentionType`, `PhiFeatures`, `TypeLabel` |
71//! | [`provisional`] | Experimental types (`BoxEmbedding`) |
//! | [`error`] | `Error`, `Result` |
72//!
73//! ## Design Philosophy
74//!
75//! - **Character offsets, not byte offsets**: Unicode-safe from the start
76//! - **Immutable where possible**: Entities are built then used, not mutated
77//! - **Serde everywhere**: All types serialize for caching and interop
78//! - **No ML dependencies**: Pure data types, no torch/candle/onnx
79//!
80//! ## Minimal surface
81//!
82//! If you’re downstream and want a small “just the contract” import set, prefer
83//! `anno_core::minimal` (or `anno::core::*` in the `anno` crate) rather than grabbing the entire
84//! re-export surface.
85
// Public submodules. Items that are not re-exported further down in this file
// are still reachable through their module path.
86pub mod calibration; // confidence score calibration
87pub mod coref; // `Mention`, `CorefChain`, `CorefDocument`
88pub mod dataset; // `DatasetSpec`, `CustomDataset`, `DatasetRegistry`
89pub mod entity; // `Entity`, `Span`, `Relation`, `EntityType`
90pub mod error; // `Error` and `Result`, re-exported below
91pub mod grounded; // `GroundedDocument`, `Identity`, `Signal`, `Track`
92pub mod historical; // ancient language provenance (BCE dates, epigraphy)
93pub mod ontology; // NOTE(review): purpose not visible from this file; nothing is re-exported from it here
94pub mod provenance; // document origin tracking
95pub mod provisional; // experimental types (`BoxEmbedding`)
96pub mod types; // `Gender`, `MentionType`, `PhiFeatures`, `TypeLabel`
97
98// Re-exports for convenience
99pub use entity::{
100 generate_span_candidates, DiscontinuousSpan, Entity, EntityBuilder, EntityCategory, EntityType,
101 EntityViewport, ExtractionMethod, HashMapLexicon, HierarchicalConfidence, Lexicon, Provenance,
102 RaggedBatch, Relation, Span, SpanCandidate, TypeMapper, ValidationIssue,
103};
104
105pub use grounded::{
106 Corpus, GroundedDocument, Identity, IdentityId, IdentitySource, Location, Modality, Quantifier,
107 Signal, SignalId, SignalRef, Track, TrackId, TrackRef, TrackStats,
108};
109
110pub use error::{Error, Result};
111
112// Dataset types
113pub use dataset::{
114 CustomDataset, DatasetRegistry, DatasetSpec, DatasetStats, Domain, License, ParserHint,
115 SplitSizes, Task, TemporalCoverage,
116};
117
118// Coreference types
119pub use coref::{entities_to_chains, CorefChain, CorefDocument, CoreferenceResolver, Mention};
120
121// Commonly used items from `types`. Modules with no re-exports here (e.g. `calibration`,
// `historical`) remain accessible via anno_core::module_name
122pub use types::{
123 DatePrecision, Gender, HistoricalDate, MentionType, MetricStats, Number, Person, PhiFeatures,
124 TemporalValidity, TypeLabel,
125};