anno/backends/hmm/mod.rs
1//! Hidden Markov Model (HMM) NER backend.
2//!
3//! Implements classical statistical NER using HMMs, the dominant approach
4//! from the 1990s before CRFs became popular. This was the **first statistical
5//! approach** to NER, replacing rule-based systems.
6//!
7//! # Historical Context
8//!
9//! NER first appeared at MUC-6 (1996), where Grishman & Sundheim defined
10//! the task of identifying people, organizations, and locations. Early
11//! systems were rule-based (lexicons, hand-crafted patterns). HMMs brought
12//! statistical learning to NER:
13//!
14//! ```text
15//! 1987 MUC-1: First IE conference (no formal NER task)
16//! 1996 MUC-6: NER formally defined (PER, ORG, LOC)
17//! 1997 Nymble (Bikel et al.): early HMM NER system
18//! 1998 BBN IdentiFinder: HMM-based, MUC-7 benchmark
19//! 2001 CRFs introduced (Lafferty et al.) — HMMs become a common comparison baseline
20//! ```
21//!
22//! # Architecture
23//!
24//! ```text
25//! Input: "John works at Google"
26//! ↓
27//! ┌─────────────────────────────────────────────────────────┐
28//! │ Hidden States (NER Tags) │
29//! │ │
30//! │ B-PER ──> O ──> O ──> B-ORG │
31//! │ │ │ │ │ │
32//! │ ↓ ↓ ↓ ↓ │
33//! │ John works at Google │
34//! │ │
35//! │ Observed Emissions │
36//! └─────────────────────────────────────────────────────────┘
37//!
38//! P(tags | words) ∝ P(tags) × P(words | tags)
39//! = ∏ P(tag_i | tag_{i-1}) × P(word_i | tag_i)
40//! ```
41//!
42//! # HMM Components
43//!
44//! 1. **States**: BIO tags (B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, O)
45//! 2. **Observations**: Words in the text
46//! 3. **Transition Probabilities**: P(tag_i | tag_{i-1})
47//! 4. **Emission Probabilities**: P(word | tag)
48//! 5. **Initial Probabilities**: P(tag | start)
49//!
50//! # Mathematical Formulation
51//!
52//! HMMs are **generative models** that model the joint probability:
53//!
54//! ```text
55//! P(x, y) = P(y) × P(x | y)
56//! = P(y_1) × ∏_{t=2}^{T} P(y_t | y_{t-1}) // transitions
57//! × ∏_{t=1}^{T} P(x_t | y_t) // emissions
58//! ```
59//!
60//! Decoding uses the **Viterbi algorithm** (dynamic programming) to find
61//! the most likely state sequence in O(T × |S|²) time.
62//!
63//! # History
64//!
65//! - Rabiner (1989): "A Tutorial on Hidden Markov Models" (foundational)
66//! - Bikel et al. 1997: "Nymble: A High-Performance Learning Name-finder"
67//! - BBN IdentiFinder: One of the first HMM-based NER systems
68//! - Often replaced by CRFs for NER in the 2000s, but still useful as a baseline/teaching model
69//!
70//! # Why HMMs Often Underperform CRFs (for NER)
71//!
72//! | Aspect | HMM | CRF |
73//! |--------|-----|-----|
74//! | Model Type | Generative | Discriminative |
75//! | Features | Word identity only | Arbitrary features |
76//! | Context | First-order Markov | Arbitrary windows |
77//! | Label Bias | Inherent | Solved |
78//! | Performance | task-dependent | task-dependent |
79//!
80//! HMMs are typically used with relatively limited emission features. CRFs can use arbitrary
81//! feature functions (capitalization, prefixes/suffixes, gazetteers, etc.) while remaining a
82//! globally normalized conditional model.
83//!
84//! # References
85//!
86//! - Rabiner (1989): "A Tutorial on Hidden Markov Models and Selected
87//! Applications in Speech Recognition" (Proceedings of IEEE)
88//! - Bikel, Miller, Schwartz, Weischedel (1997): "Nymble: A High-Performance
89//! Learning Name-finder" (ANLP)
90//! - Bikel, Schwartz, Weischedel (1999): "An Algorithm that Learns What's
91//! in a Name" (Machine Learning)
92//!
93//! # See Also
94//!
95//! - CRF-style sequence models (`backends/crf.rs`)
96
97use crate::{Entity, EntityType, Model, Result};
98use std::collections::HashMap;
99
100#[cfg(feature = "bundled-hmm-params")]
101use std::sync::OnceLock;
102
103#[cfg(feature = "bundled-hmm-params")]
104use serde_json as _;
105
/// Raw bundled HMM parameters, as deserialized from the embedded payload.
///
/// NOTE(review): `backoff` is kept as an untyped `serde_json::Value` here and
/// presumably parsed into `HmmBackoff` by the loader — confirm against the
/// bundled-params code path (feature `bundled-hmm-params`).
#[derive(Debug, Clone)]
struct HmmParams {
    /// State labels (BIO tags); defines the index space for the vectors below.
    states: Vec<String>,
    /// Initial state probabilities, aligned with `states`.
    initial: Vec<f64>,
    /// Transition matrix; presumably `transitions[i][j] = P(j | i)`, mirroring
    /// the layout documented on `HmmNER::transitions`.
    transitions: Vec<Vec<f64>>,
    /// Unparsed emission-backoff section of the bundled payload.
    backoff: serde_json::Value,
}
113
/// Parsed emission-backoff tables shipped with the crate.
///
/// Attached to `HmmNER::backoff`; presumably used by the emission scorer in
/// `algorithm.rs` to score words missing from the emission table via coarse
/// word-shape features — TODO confirm in algorithm.rs.
#[derive(Debug, Clone)]
struct HmmBackoff {
    /// len_bucket -> probs per state index (aligned with `states`)
    len: HashMap<String, Vec<f64>>,
    /// boolean feature -> P(feature_present | state) per state index
    bool_present: HashMap<String, Vec<f64>>,
    /// Stable list of boolean features to include absent probabilities.
    /// (Keeps the absent-feature product deterministic across runs.)
    bool_keys: Vec<String>,
}
123
/// HMM configuration.
///
/// Construct via [`Default`] for the tuned settings.
#[derive(Debug, Clone)]
pub struct HmmConfig {
    /// Smoothing parameter for unseen words. Default: `1e-10`.
    pub smoothing: f64,
    /// Use log probabilities for numerical stability. Default: `true`.
    pub use_log_probs: bool,
    /// Optional penalty applied to non-O emissions when using bundled backoff.
    ///
    /// Values < 1.0 reduce spurious entities; values > 1.0 increase recall but may over-tag.
    /// Default: `0.5`.
    pub non_o_emission_scale: f64,
    /// If true, prefer bundled priors/transitions (when available) instead of heuristic dynamics.
    /// Default: `true`.
    pub use_bundled_dynamics: bool,
}
138
139impl Default for HmmConfig {
140 fn default() -> Self {
141 Self {
142 smoothing: 1e-10,
143 use_log_probs: true,
144 // Tuned to reduce spurious tagging when bundled params are enabled.
145 non_o_emission_scale: 0.5,
146 // When bundled params are available, use their dynamics by default.
147 // This keeps the "trained" path genuinely end-to-end.
148 use_bundled_dynamics: true,
149 }
150 }
151}
152
/// Hidden Markov Model for NER.
///
/// This implements a first-order HMM (bigram) for sequence labeling.
/// Uses the Viterbi algorithm for decoding (see `algorithm.rs`).
///
/// Invariant: `states`, `state_to_idx`, `transitions`, and `initial` all
/// share the same state index space.
#[derive(Debug)]
pub struct HmmNER {
    /// Configuration (smoothing, log-space scoring, bundled-param knobs).
    config: HmmConfig,
    /// State labels (BIO tags).
    states: Vec<String>,
    /// State to index mapping (inverse of `states`).
    state_to_idx: HashMap<String, usize>,
    /// Transition probabilities: P(state_j | state_i)
    /// transitions\[i\]\[j\] = P(j | i)
    transitions: Vec<Vec<f64>>,
    /// Initial state probabilities: P(tag | start), aligned with `states`.
    initial: Vec<f64>,
    /// Emission probabilities: P(word | state)
    /// Key: (state_idx, word), Value: probability
    emissions: HashMap<(usize, String), f64>,
    /// Vocabulary for unknown word handling.
    #[allow(dead_code)] // Reserved for OOV handling
    vocab: HashMap<String, usize>,
    /// Optional bundled emission backoff (small, trained).
    /// Presumably `None` when bundled parameters are unavailable — confirm
    /// against the constructor/loader.
    backoff: Option<HmmBackoff>,
}
179
180mod algorithm;
181// HMM Viterbi, forward-backward, and emission scoring: see algorithm.rs.
182impl Model for HmmNER {
183 fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
184 if text.trim().is_empty() {
185 return Ok(vec![]);
186 }
187
188 let words: Vec<&str> = text.split_whitespace().collect();
189 if words.is_empty() {
190 return Ok(vec![]);
191 }
192
193 let label_indices = self.viterbi(&words);
194 let entities = self.decode_entities(text, &words, &label_indices);
195
196 Ok(entities)
197 }
198
199 fn supported_types(&self) -> Vec<EntityType> {
200 vec![
201 EntityType::Person,
202 EntityType::Organization,
203 EntityType::Location,
204 EntityType::Other("MISC".to_string()),
205 ]
206 }
207
208 fn is_available(&self) -> bool {
209 true
210 }
211
212 fn capabilities(&self) -> crate::ModelCapabilities {
213 // HMM has no specialized batch or streaming impl; all defaults are false.
214 crate::ModelCapabilities::default()
215 }
216}
217
// Marker impls: `Sealed` opts HmmNER into the crate's sealed-trait pattern;
// the empty `NamedEntityCapable` impl advertises NER support (the trait
// presumably has no required methods — confirm in the crate root).
impl crate::sealed::Sealed for HmmNER {}
impl crate::NamedEntityCapable for HmmNER {}
220
221#[cfg(test)]
222mod tests;