// anno/backends/lexicon.rs
//! Lexicon-based NER backend.
//!
//! Provides exact-match entity lookup using gazetteers/lexicons.
//! Useful for closed-domain entities (stock tickers, medical codes, known catalogs).
//!
//! # Research Context
//!
//! Gazetteers are most valuable when:
//! 1. **Domain is closed**: Fixed, known entity lists
//! 2. **Text is short**: where context is insufficient (see the "gazetteer + neural" literature)
//! 3. **Used as features**: Input to a neural model, not the final output
//!
//! # Usage
//!
//! ```rust
//! use anno::{Model, LexiconNER};
//! use anno::{HashMapLexicon, EntityType};
//!
//! // Create a domain-specific lexicon
//! let mut lexicon = HashMapLexicon::new("stock_tickers");
//! lexicon.insert("AAPL", EntityType::Organization, 0.99);
//! lexicon.insert("GOOGL", EntityType::Organization, 0.99);
//!
//! // Use as a backend
//! let ner = LexiconNER::new(lexicon);
//! let entities = ner
//!     .extract_entities("AAPL stock rose today.", None)
//!     .unwrap();
//! ```
//!
//! # Integration with StackedNER
//!
//! LexiconNER can be used as a layer in StackedNER for hybrid extraction:
//!
//! ```rust
//! use anno::{Model, StackedNER, RegexNER, LexiconNER};
//! use anno::{HashMapLexicon, EntityType};
//!
//! let mut lexicon = HashMapLexicon::new("medical_codes");
//! lexicon.insert("ICD-10", EntityType::Other("CODE".to_string()), 0.95);
//!
//! let ner = StackedNER::builder()
//!     .layer(RegexNER::new())          // Structured entities
//!     .layer(LexiconNER::new(lexicon)) // Domain-specific lookup
//!     .build();
//! ```

48use crate::{Entity, EntityType, Model, Result};
49use anno_core::Lexicon;
50use std::sync::Arc;
51
/// NER backend that uses exact-match lexicon lookup.
///
/// Scans text for known entities from a lexicon/gazetteer.
/// Best for closed-domain entities where the full list is known.
pub struct LexiconNER {
    /// Shared lexicon consulted for exact span lookups.
    lexicon: Arc<dyn Lexicon + Send + Sync>,
    /// If `false` (the default set by `new`), lookups additionally probe
    /// lowercase and Capitalized variants of each candidate span.
    case_sensitive: bool,
    /// Minimum word boundary requirement (true = only match whole words)
    word_boundary: bool,
}
62
63impl LexiconNER {
64 /// Create a new LexiconNER with the given lexicon.
65 pub fn new(lexicon: impl Lexicon + 'static) -> Self {
66 Self {
67 lexicon: Arc::new(lexicon),
68 case_sensitive: false,
69 word_boundary: true,
70 }
71 }
72
73 /// Create with case-sensitive matching.
74 pub fn with_case_sensitive(mut self, case_sensitive: bool) -> Self {
75 self.case_sensitive = case_sensitive;
76 self
77 }
78
79 /// Create with word boundary requirement.
80 ///
81 /// If `true`, only matches whole words (default).
82 /// If `false`, matches substrings (e.g., "Apple" matches in "AppleInc").
83 pub fn with_word_boundary(mut self, word_boundary: bool) -> Self {
84 self.word_boundary = word_boundary;
85 self
86 }
87
88 /// Get a reference to the underlying lexicon.
89 pub fn lexicon(&self) -> &dyn Lexicon {
90 self.lexicon.as_ref()
91 }
92}
93
94impl Model for LexiconNER {
95 fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
96 let mut entities = Vec::new();
97
98 // For efficiency with large lexicons, we scan the text and check potential spans
99 // against the lexicon. This is O(n*m) where n=text length, m=avg entity length.
100 // For production with large lexicons, consider Aho-Corasick algorithm.
101
102 let text_chars: Vec<char> = text.chars().collect();
103 let text_len = text_chars.len();
104
105 // Detect if this is a CJK language (no word boundaries)
106 let lang_code = language.map(|l| l.split('-').next().unwrap_or(l).to_lowercase());
107 let is_cjk = lang_code
108 .as_deref()
109 .is_some_and(|l| matches!(l, "zh" | "ja" | "ko"));
110
111 // Helper to check if character is a word boundary marker
112 // For CJK: punctuation and whitespace are boundaries
113 // For other languages: alphanumeric vs non-alphanumeric
114 let is_word_boundary_char = |c: char| -> bool {
115 if is_cjk {
116 // CJK: punctuation, whitespace, and some CJK punctuation marks
117 c.is_whitespace()
118 || matches!(
119 c,
120 '。' | ',' | '、' | ';' | ':' | '?' | '!' | '・' | // CJK punctuation (Chinese/Japanese)
121 '.' | ',' | ';' | ':' | '?' | '!' | '(' | ')' | '[' | ']' | '{' | '}'
122 )
123 } else {
124 // Non-CJK: non-alphanumeric characters
125 !c.is_alphanumeric()
126 }
127 };
128
129 // Try all possible spans (word boundaries if word_boundary=true, or all substrings)
130 for start in 0..text_len {
131 // Try spans of increasing length
132 for end in (start + 1)..=text_len.min(start + 50) {
133 // Limit max span length
134 let span_text: String = text_chars[start..end].iter().collect();
135
136 // Check word boundary if required
137 if self.word_boundary {
138 let is_word_start =
139 start == 0 || is_word_boundary_char(text_chars[start.saturating_sub(1)]);
140 let is_word_end = end >= text_len || is_word_boundary_char(text_chars[end]);
141 if !is_word_start || !is_word_end {
142 continue;
143 }
144 }
145
146 // Try exact match
147 // For case-insensitive: we need to check if lexicon has the entry in any case
148 // Since Lexicon trait only supports exact lookup, we try both original and lowercase
149 // In a production system, consider using a case-normalized lexicon or Aho-Corasick
150 let matched = if self.case_sensitive {
151 self.lexicon.lookup(&span_text)
152 } else {
153 // Try original case first, then lowercase
154 // Note: This assumes lexicon entries are stored in a specific case
155 // For better case-insensitive matching, lexicon should normalize internally
156 self.lexicon
157 .lookup(&span_text)
158 .or_else(|| {
159 let lower = span_text.to_lowercase();
160 if lower != span_text {
161 self.lexicon.lookup(&lower)
162 } else {
163 None
164 }
165 })
166 // Also try with first letter capitalized (common pattern)
167 .or_else(|| {
168 let mut capitalized = span_text.to_lowercase();
169 if let Some(first) = capitalized.chars().next() {
170 capitalized.replace_range(
171 0..first.len_utf8(),
172 &first.to_uppercase().to_string(),
173 );
174 if capitalized != span_text {
175 self.lexicon.lookup(&capitalized)
176 } else {
177 None
178 }
179 } else {
180 None
181 }
182 })
183 };
184
185 if let Some((entity_type, confidence)) = matched {
186 // Found a match - convert byte positions to character positions
187 let char_start = text
188 .char_indices()
189 .nth(start)
190 .map(|(i, _)| i)
191 .unwrap_or(text.len());
192 let char_end = text
193 .char_indices()
194 .nth(end)
195 .map(|(i, _)| i)
196 .unwrap_or(text.len());
197
198 // Extract actual text span (preserving original case)
199 let actual_span: String = text.chars().skip(start).take(end - start).collect();
200
201 let provenance = anno_core::Provenance {
202 source: std::borrow::Cow::Borrowed("lexicon"),
203 method: anno_core::ExtractionMethod::Neural, // Lexicon variant deprecated
204 pattern: Some(std::borrow::Cow::Owned(format!(
205 "lexicon:{}",
206 self.lexicon.source()
207 ))),
208 raw_confidence: Some(confidence),
209 model_version: None,
210 timestamp: None,
211 };
212
213 entities.push(Entity::with_provenance(
214 actual_span,
215 entity_type,
216 char_start,
217 char_end,
218 confidence,
219 provenance,
220 ));
221
222 // Skip ahead to avoid overlapping matches (greedy matching)
223 break;
224 }
225 }
226 }
227
228 // Sort by position and remove overlaps (keep longest)
229 entities.sort_by_key(|e| (e.start, e.end));
230 let mut deduped: Vec<Entity> = Vec::new();
231 for entity in entities {
232 if deduped.is_empty() || !deduped.last().unwrap().overlaps(&entity) {
233 deduped.push(entity);
234 } else {
235 // Keep the longer span
236 let last = deduped.last_mut().unwrap();
237 if entity.end - entity.start > last.end - last.start {
238 *last = entity;
239 }
240 }
241 }
242
243 Ok(deduped)
244 }
245
246 fn supported_types(&self) -> Vec<EntityType> {
247 // We can't enumerate all types from the lexicon trait alone
248 // Return empty vec - types will be discovered during extraction
249 // For better type reporting, consider adding an entries() method to Lexicon trait
250 vec![]
251 }
252
253 fn is_available(&self) -> bool {
254 !self.lexicon.is_empty()
255 }
256
257 fn name(&self) -> &'static str {
258 "lexicon"
259 }
260
261 fn description(&self) -> &'static str {
262 "Exact-match lexicon/gazetteer lookup"
263 }
264}
265
266impl crate::BatchCapable for LexiconNER {
267 fn extract_entities_batch(
268 &self,
269 texts: &[&str],
270 _language: Option<&str>,
271 ) -> Result<Vec<Vec<Entity>>> {
272 texts
273 .iter()
274 .map(|text| self.extract_entities(text, None))
275 .collect()
276 }
277}
278
279impl crate::StreamingCapable for LexiconNER {
280 fn extract_entities_streaming(&self, chunk: &str, offset: usize) -> Result<Vec<Entity>> {
281 let mut entities = self.extract_entities(chunk, None)?;
282 for entity in &mut entities {
283 entity.start += offset;
284 entity.end += offset;
285 }
286 Ok(entities)
287 }
288}
289
#[cfg(test)]
mod tests {
    use super::*;
    use anno_core::HashMapLexicon;

    #[test]
    fn test_lexicon_ner_basic() {
        let mut gazetteer = HashMapLexicon::new("test");
        gazetteer.insert("Apple", EntityType::Organization, 0.99);
        gazetteer.insert("Microsoft", EntityType::Organization, 0.99);

        let backend = LexiconNER::new(gazetteer);
        let found = backend
            .extract_entities("Apple and Microsoft are tech companies.", None)
            .unwrap();

        assert_eq!(found.len(), 2);
        let has_org = |name: &str| {
            found
                .iter()
                .any(|e| e.text == name && e.entity_type == EntityType::Organization)
        };
        assert!(has_org("Apple"));
        assert!(has_org("Microsoft"));
    }

    #[test]
    fn test_lexicon_ner_case_insensitive() {
        let mut gazetteer = HashMapLexicon::new("test");
        gazetteer.insert("Apple", EntityType::Organization, 0.99);

        // Matching is case-insensitive by default: lowercase "apple" should
        // still resolve against the capitalized lexicon entry.
        let backend = LexiconNER::new(gazetteer);
        let found = backend.extract_entities("apple stock rose.", None).unwrap();

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "apple");
    }

    #[test]
    fn test_lexicon_ner_word_boundary() {
        let mut gazetteer = HashMapLexicon::new("test");
        gazetteer.insert("Apple", EntityType::Organization, 0.99);

        // With the default whole-word requirement, "Apple" must not match
        // inside the longer token "AppleInc".
        let backend = LexiconNER::new(gazetteer);
        let found = backend
            .extract_entities("AppleInc is a company.", None)
            .unwrap();

        assert_eq!(found.len(), 0);
    }

    #[test]
    fn test_lexicon_ner_no_word_boundary() {
        let mut gazetteer = HashMapLexicon::new("test");
        gazetteer.insert("Apple", EntityType::Organization, 0.99);

        // Substring matching enabled: "Apple" is found inside "AppleInc".
        let backend = LexiconNER::new(gazetteer).with_word_boundary(false);
        let found = backend.extract_entities("AppleInc", None).unwrap();

        assert!(found.iter().any(|e| e.text == "Apple"));
    }

    #[test]
    fn test_lexicon_ner_unicode_offsets() {
        let mut gazetteer = HashMapLexicon::new("test");
        gazetteer.insert("東京", EntityType::Location, 0.99);

        let backend = LexiconNER::new(gazetteer);
        let text = "Visit 東京 for tourism.";
        let found = backend.extract_entities(text, None).unwrap();

        assert_eq!(found.len(), 1);
        let hit = &found[0];
        assert_eq!(hit.text, "東京");
        assert!(hit.start < hit.end);
        assert!(hit.end <= text.chars().count());
    }
}
369}