// anno/backends/hmm/mod.rs
//! Hidden Markov Model (HMM) NER backend.
//!
//! Implements classical statistical NER using HMMs, the dominant approach
//! from the 1990s before CRFs became popular. This was the **first statistical
//! approach** to NER, replacing rule-based systems.
//!
//! # Historical Context
//!
//! NER first appeared at MUC-6 (1996), where Grishman & Sundheim defined
//! the task of identifying people, organizations, and locations. Early
//! systems were rule-based (lexicons, hand-crafted patterns). HMMs brought
//! statistical learning to NER:
//!
//! ```text
//! 1987  MUC-1: First IE conference (no formal NER task)
//! 1996  MUC-6: NER formally defined (PER, ORG, LOC)
//! 1997  Nymble (Bikel et al.): early HMM NER system
//! 1998  BBN IdentiFinder: HMM-based, MUC-7 benchmark
//! 2001  CRFs introduced (Lafferty et al.) — HMMs become a common comparison baseline
//! ```
//!
//! # Architecture
//!
//! ```text
//! Input: "John works at Google"
//!    ↓
//! ┌─────────────────────────────────────────────────────────┐
//! │ Hidden States (NER Tags)                                │
//! │                                                         │
//! │  B-PER ──> O ──> O ──> B-ORG                            │
//! │    │       │     │       │                              │
//! │    ↓       ↓     ↓       ↓                              │
//! │  John   works   at   Google                             │
//! │                                                         │
//! │ Observed Emissions                                      │
//! └─────────────────────────────────────────────────────────┘
//!
//! P(tags | words) ∝ P(tags) × P(words | tags)
//!                 = ∏ P(tag_i | tag_{i-1}) × P(word_i | tag_i)
//! ```
//!
//! # HMM Components
//!
//! 1. **States**: BIO tags (B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, O)
//! 2. **Observations**: Words in the text
//! 3. **Transition Probabilities**: P(tag_i | tag_{i-1})
//! 4. **Emission Probabilities**: P(word | tag)
//! 5. **Initial Probabilities**: P(tag | start)
//!
//! # Mathematical Formulation
//!
//! HMMs are **generative models** that model the joint probability:
//!
//! ```text
//! P(x, y) = P(y) × P(x | y)
//!         = P(y_1) × ∏_{t=2}^{T} P(y_t | y_{t-1})    // transitions
//!                 × ∏_{t=1}^{T} P(x_t | y_t)          // emissions
//! ```
//!
//! Decoding uses the **Viterbi algorithm** (dynamic programming) to find
//! the most likely state sequence in O(T × |S|²) time.
//!
//! # History
//!
//! - Rabiner (1989): "A Tutorial on Hidden Markov Models" (foundational)
//! - Bikel et al. 1997: "Nymble: A High-Performance Learning Name-finder"
//! - BBN IdentiFinder: One of the first HMM-based NER systems
//! - Often replaced by CRFs for NER in the 2000s, but still useful as a baseline/teaching model
//!
//! # Why HMMs Often Underperform CRFs (for NER)
//!
//! | Aspect | HMM | CRF |
//! |--------|-----|-----|
//! | Model Type | Generative | Discriminative |
//! | Features | Word identity only | Arbitrary features |
//! | Context | First-order Markov | Arbitrary windows |
//! | Label Bias | N/A (affects locally-normalized MEMMs, not generative HMMs) | Avoided via global normalization |
//! | Performance | task-dependent | task-dependent |
//!
//! HMMs are typically used with relatively limited emission features. CRFs can use arbitrary
//! feature functions (capitalization, prefixes/suffixes, gazetteers, etc.) while remaining a
//! globally normalized conditional model.
//!
//! # References
//!
//! - Rabiner (1989): "A Tutorial on Hidden Markov Models and Selected
//!   Applications in Speech Recognition" (Proceedings of IEEE)
//! - Bikel, Miller, Schwartz, Weischedel (1997): "Nymble: A High-Performance
//!   Learning Name-finder" (ANLP)
//! - Bikel, Schwartz, Weischedel (1999): "An Algorithm that Learns What's
//!   in a Name" (Machine Learning)
//!
//! # See Also
//!
//! - CRF-style sequence models (`backends/crf.rs`)

97use crate::{Entity, EntityType, Model, Result};
98use std::collections::HashMap;
99
100#[cfg(feature = "bundled-hmm-params")]
101use std::sync::OnceLock;
102
103#[cfg(feature = "bundled-hmm-params")]
104use serde_json as _;
105
/// Bundled HMM parameters in their raw, on-disk-adjacent form.
///
/// NOTE(review): the code that populates this (presumably deserializing the
/// bundled JSON behind the `bundled-hmm-params` feature) is not in this
/// file — confirm in `algorithm.rs`.
#[derive(Debug, Clone)]
struct HmmParams {
    /// State labels; index order defines the alignment of `initial` and
    /// `transitions`.
    states: Vec<String>,
    /// Initial state probabilities, aligned with `states`.
    initial: Vec<f64>,
    /// Transition matrix: `transitions[i][j] = P(state_j | state_i)`.
    transitions: Vec<Vec<f64>>,
    /// Emission backoff tables kept as raw JSON; converted into the typed
    /// `HmmBackoff` by the consumer (not visible in this file).
    backoff: serde_json::Value,
}
113
/// Bundled emission backoff statistics (small, trained).
///
/// NOTE(review): consumed by emission scoring in `algorithm.rs` (not visible
/// in this file) — confirm exact usage there.
#[derive(Debug, Clone)]
struct HmmBackoff {
    /// len_bucket -> probs per state index (aligned with `states`)
    len: HashMap<String, Vec<f64>>,
    /// boolean feature -> P(feature_present | state) per state index
    bool_present: HashMap<String, Vec<f64>>,
    /// Stable list of boolean features to include absent probabilities.
    bool_keys: Vec<String>,
}
123
/// HMM configuration.
#[derive(Debug, Clone)]
pub struct HmmConfig {
    /// Smoothing parameter for unseen words.
    pub smoothing: f64,
    /// Use log probabilities for numerical stability.
    pub use_log_probs: bool,
    /// Optional penalty applied to non-O emissions when using bundled backoff.
    ///
    /// Values < 1.0 reduce spurious entities; values > 1.0 increase recall but may over-tag.
    pub non_o_emission_scale: f64,
    /// If true, prefer bundled priors/transitions (when available) instead of heuristic dynamics.
    pub use_bundled_dynamics: bool,
}

impl Default for HmmConfig {
    /// Defaults favor log-space scoring and the bundled ("trained")
    /// dynamics whenever they are available.
    fn default() -> Self {
        // Tiny floor probability assigned to words never seen in training.
        let smoothing = 1e-10;
        Self {
            smoothing,
            // Log-space computation keeps long sequences from underflowing.
            use_log_probs: true,
            // Damping factor below 1.0 cuts spurious tagging when the
            // bundled parameters are enabled.
            non_o_emission_scale: 0.5,
            // Using bundled dynamics by default keeps the trained path
            // genuinely end-to-end.
            use_bundled_dynamics: true,
        }
    }
}
152
/// Hidden Markov Model for NER.
///
/// This implements a first-order HMM (bigram) for sequence labeling.
/// Uses the Viterbi algorithm for decoding.
///
/// Decoding and emission scoring live in the `algorithm` submodule
/// (`viterbi`, `decode_entities`).
#[derive(Debug)]
pub struct HmmNER {
    /// Configuration.
    config: HmmConfig,
    /// State labels (BIO tags).
    states: Vec<String>,
    /// State to index mapping (inverse of `states`).
    state_to_idx: HashMap<String, usize>,
    /// Transition probabilities: P(state_j | state_i)
    /// transitions\[i\]\[j\] = P(j | i)
    transitions: Vec<Vec<f64>>,
    /// Initial state probabilities: P(state | start)
    initial: Vec<f64>,
    /// Emission probabilities: P(word | state)
    /// Key: (state_idx, word), Value: probability
    emissions: HashMap<(usize, String), f64>,
    /// Vocabulary for unknown word handling.
    #[allow(dead_code)] // Reserved for OOV handling
    vocab: HashMap<String, usize>,
    /// Optional bundled emission backoff (small, trained).
    /// Populated only when bundled params are available — see `HmmBackoff`.
    backoff: Option<HmmBackoff>,
}
179
180mod algorithm;
181// HMM Viterbi, forward-backward, and emission scoring: see algorithm.rs.
182impl Model for HmmNER {
183    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
184        if text.trim().is_empty() {
185            return Ok(vec![]);
186        }
187
188        let words: Vec<&str> = text.split_whitespace().collect();
189        if words.is_empty() {
190            return Ok(vec![]);
191        }
192
193        let label_indices = self.viterbi(&words);
194        let entities = self.decode_entities(text, &words, &label_indices);
195
196        Ok(entities)
197    }
198
199    fn supported_types(&self) -> Vec<EntityType> {
200        vec![
201            EntityType::Person,
202            EntityType::Organization,
203            EntityType::Location,
204            EntityType::Other("MISC".to_string()),
205        ]
206    }
207
208    fn is_available(&self) -> bool {
209        true
210    }
211
212    fn capabilities(&self) -> crate::ModelCapabilities {
213        // HMM has no specialized batch or streaming impl; all defaults are false.
214        crate::ModelCapabilities::default()
215    }
216}
217
// Opt in to the crate's sealed-trait pattern: `Sealed` cannot be implemented
// outside this crate, which keeps `NamedEntityCapable` crate-controlled.
impl crate::sealed::Sealed for HmmNER {}
impl crate::NamedEntityCapable for HmmNER {}
220
221#[cfg(test)]
222mod tests;