// anno/backends/lexicon.rs
1//! Lexicon-based NER backend.
2//!
3//! Provides exact-match entity lookup using gazetteers/lexicons.
4//! Useful for closed-domain entities (stock tickers, medical codes, known catalogs).
5//!
6//! # Research Context
7//!
8//! Gazetteers are most valuable when:
9//! 1. **Domain is closed**: Fixed, known entity lists
10//! 2. **Text is short**: where context is insufficient (see the “gazetteer + neural” literature)
11//! 3. **Used as features**: Input to neural model, not final output
12//!
13//! # Usage
14//!
15//! ```rust
16//! use anno::{Model, LexiconNER};
17//! use anno::{HashMapLexicon, EntityType};
18//!
19//! // Create a domain-specific lexicon
20//! let mut lexicon = HashMapLexicon::new("stock_tickers");
21//! lexicon.insert("AAPL", EntityType::Organization, 0.99);
22//! lexicon.insert("GOOGL", EntityType::Organization, 0.99);
23//!
24//! // Use as a backend
25//! let ner = LexiconNER::new(lexicon);
26//! let entities = ner
27//!     .extract_entities("AAPL stock rose today.", None)
28//!     .unwrap();
29//! ```
30//!
31//! # Integration with StackedNER
32//!
33//! LexiconNER can be used as a layer in StackedNER for hybrid extraction:
34//!
35//! ```rust
36//! use anno::{Model, StackedNER, RegexNER, LexiconNER};
37//! use anno::{HashMapLexicon, EntityType};
38//!
39//! let mut lexicon = HashMapLexicon::new("medical_codes");
40//! lexicon.insert("ICD-10", EntityType::Other("CODE".to_string()), 0.95);
41//!
42//! let ner = StackedNER::builder()
43//!     .layer(RegexNER::new())           // Structured entities
44//!     .layer(LexiconNER::new(lexicon))  // Domain-specific lookup
45//!     .build();
46//! ```
47
48use crate::{Entity, EntityType, Model, Result};
49use anno_core::Lexicon;
50use std::sync::Arc;
51
/// NER backend that uses exact-match lexicon lookup.
///
/// Scans text for known entities from a lexicon/gazetteer.
/// Best for closed-domain entities where the full list is known.
///
/// Defaults (set in `new`): case-insensitive matching, whole-word matches only.
pub struct LexiconNER {
    /// Lexicon consulted for exact span lookups; stored as a trait object so
    /// any `Lexicon` implementation can back this NER.
    lexicon: Arc<dyn Lexicon + Send + Sync>,
    /// When `false` (the default), lookups also try lowercase and
    /// first-letter-capitalized variants of each candidate span.
    case_sensitive: bool,
    /// Minimum word boundary requirement (true = only match whole words)
    word_boundary: bool,
}
62
63impl LexiconNER {
64    /// Create a new LexiconNER with the given lexicon.
65    pub fn new(lexicon: impl Lexicon + 'static) -> Self {
66        Self {
67            lexicon: Arc::new(lexicon),
68            case_sensitive: false,
69            word_boundary: true,
70        }
71    }
72
73    /// Create with case-sensitive matching.
74    pub fn with_case_sensitive(mut self, case_sensitive: bool) -> Self {
75        self.case_sensitive = case_sensitive;
76        self
77    }
78
79    /// Create with word boundary requirement.
80    ///
81    /// If `true`, only matches whole words (default).
82    /// If `false`, matches substrings (e.g., "Apple" matches in "AppleInc").
83    pub fn with_word_boundary(mut self, word_boundary: bool) -> Self {
84        self.word_boundary = word_boundary;
85        self
86    }
87
88    /// Get a reference to the underlying lexicon.
89    pub fn lexicon(&self) -> &dyn Lexicon {
90        self.lexicon.as_ref()
91    }
92}
93
94impl Model for LexiconNER {
95    fn extract_entities(&self, text: &str, language: Option<&str>) -> Result<Vec<Entity>> {
96        let mut entities = Vec::new();
97
98        // For efficiency with large lexicons, we scan the text and check potential spans
99        // against the lexicon. This is O(n*m) where n=text length, m=avg entity length.
100        // For production with large lexicons, consider Aho-Corasick algorithm.
101
102        let text_chars: Vec<char> = text.chars().collect();
103        let text_len = text_chars.len();
104
105        // Detect if this is a CJK language (no word boundaries)
106        let lang_code = language.map(|l| l.split('-').next().unwrap_or(l).to_lowercase());
107        let is_cjk = lang_code
108            .as_deref()
109            .is_some_and(|l| matches!(l, "zh" | "ja" | "ko"));
110
111        // Helper to check if character is a word boundary marker
112        // For CJK: punctuation and whitespace are boundaries
113        // For other languages: alphanumeric vs non-alphanumeric
114        let is_word_boundary_char = |c: char| -> bool {
115            if is_cjk {
116                // CJK: punctuation, whitespace, and some CJK punctuation marks
117                c.is_whitespace()
118                    || matches!(
119                        c,
120                        '。' | ',' | '、' | ';' | ':' | '?' | '!' | '・' | // CJK punctuation (Chinese/Japanese)
121                    '.' | ',' | ';' | ':' | '?' | '!' | '(' | ')' | '[' | ']' | '{' | '}'
122                    )
123            } else {
124                // Non-CJK: non-alphanumeric characters
125                !c.is_alphanumeric()
126            }
127        };
128
129        // Try all possible spans (word boundaries if word_boundary=true, or all substrings)
130        for start in 0..text_len {
131            // Try spans of increasing length
132            for end in (start + 1)..=text_len.min(start + 50) {
133                // Limit max span length
134                let span_text: String = text_chars[start..end].iter().collect();
135
136                // Check word boundary if required
137                if self.word_boundary {
138                    let is_word_start =
139                        start == 0 || is_word_boundary_char(text_chars[start.saturating_sub(1)]);
140                    let is_word_end = end >= text_len || is_word_boundary_char(text_chars[end]);
141                    if !is_word_start || !is_word_end {
142                        continue;
143                    }
144                }
145
146                // Try exact match
147                // For case-insensitive: we need to check if lexicon has the entry in any case
148                // Since Lexicon trait only supports exact lookup, we try both original and lowercase
149                // In a production system, consider using a case-normalized lexicon or Aho-Corasick
150                let matched = if self.case_sensitive {
151                    self.lexicon.lookup(&span_text)
152                } else {
153                    // Try original case first, then lowercase
154                    // Note: This assumes lexicon entries are stored in a specific case
155                    // For better case-insensitive matching, lexicon should normalize internally
156                    self.lexicon
157                        .lookup(&span_text)
158                        .or_else(|| {
159                            let lower = span_text.to_lowercase();
160                            if lower != span_text {
161                                self.lexicon.lookup(&lower)
162                            } else {
163                                None
164                            }
165                        })
166                        // Also try with first letter capitalized (common pattern)
167                        .or_else(|| {
168                            let mut capitalized = span_text.to_lowercase();
169                            if let Some(first) = capitalized.chars().next() {
170                                capitalized.replace_range(
171                                    0..first.len_utf8(),
172                                    &first.to_uppercase().to_string(),
173                                );
174                                if capitalized != span_text {
175                                    self.lexicon.lookup(&capitalized)
176                                } else {
177                                    None
178                                }
179                            } else {
180                                None
181                            }
182                        })
183                };
184
185                if let Some((entity_type, confidence)) = matched {
186                    // Found a match - convert byte positions to character positions
187                    let char_start = text
188                        .char_indices()
189                        .nth(start)
190                        .map(|(i, _)| i)
191                        .unwrap_or(text.len());
192                    let char_end = text
193                        .char_indices()
194                        .nth(end)
195                        .map(|(i, _)| i)
196                        .unwrap_or(text.len());
197
198                    // Extract actual text span (preserving original case)
199                    let actual_span: String = text.chars().skip(start).take(end - start).collect();
200
201                    let provenance = anno_core::Provenance {
202                        source: std::borrow::Cow::Borrowed("lexicon"),
203                        method: anno_core::ExtractionMethod::Neural, // Lexicon variant deprecated
204                        pattern: Some(std::borrow::Cow::Owned(format!(
205                            "lexicon:{}",
206                            self.lexicon.source()
207                        ))),
208                        raw_confidence: Some(confidence),
209                        model_version: None,
210                        timestamp: None,
211                    };
212
213                    entities.push(Entity::with_provenance(
214                        actual_span,
215                        entity_type,
216                        char_start,
217                        char_end,
218                        confidence,
219                        provenance,
220                    ));
221
222                    // Skip ahead to avoid overlapping matches (greedy matching)
223                    break;
224                }
225            }
226        }
227
228        // Sort by position and remove overlaps (keep longest)
229        entities.sort_by_key(|e| (e.start, e.end));
230        let mut deduped: Vec<Entity> = Vec::new();
231        for entity in entities {
232            if deduped.is_empty() || !deduped.last().unwrap().overlaps(&entity) {
233                deduped.push(entity);
234            } else {
235                // Keep the longer span
236                let last = deduped.last_mut().unwrap();
237                if entity.end - entity.start > last.end - last.start {
238                    *last = entity;
239                }
240            }
241        }
242
243        Ok(deduped)
244    }
245
246    fn supported_types(&self) -> Vec<EntityType> {
247        // We can't enumerate all types from the lexicon trait alone
248        // Return empty vec - types will be discovered during extraction
249        // For better type reporting, consider adding an entries() method to Lexicon trait
250        vec![]
251    }
252
253    fn is_available(&self) -> bool {
254        !self.lexicon.is_empty()
255    }
256
257    fn name(&self) -> &'static str {
258        "lexicon"
259    }
260
261    fn description(&self) -> &'static str {
262        "Exact-match lexicon/gazetteer lookup"
263    }
264}
265
266impl crate::BatchCapable for LexiconNER {
267    fn extract_entities_batch(
268        &self,
269        texts: &[&str],
270        _language: Option<&str>,
271    ) -> Result<Vec<Vec<Entity>>> {
272        texts
273            .iter()
274            .map(|text| self.extract_entities(text, None))
275            .collect()
276    }
277}
278
279impl crate::StreamingCapable for LexiconNER {
280    fn extract_entities_streaming(&self, chunk: &str, offset: usize) -> Result<Vec<Entity>> {
281        let mut entities = self.extract_entities(chunk, None)?;
282        for entity in &mut entities {
283            entity.start += offset;
284            entity.end += offset;
285        }
286        Ok(entities)
287    }
288}
289
#[cfg(test)]
mod tests {
    use super::*;
    use anno_core::HashMapLexicon;

    /// Convenience: build a "test" lexicon containing the given names as
    /// `Organization` entries with 0.99 confidence.
    fn org_lexicon(names: &[&str]) -> HashMapLexicon {
        let mut lexicon = HashMapLexicon::new("test");
        for &name in names {
            lexicon.insert(name, EntityType::Organization, 0.99);
        }
        lexicon
    }

    #[test]
    fn test_lexicon_ner_basic() {
        let ner = LexiconNER::new(org_lexicon(&["Apple", "Microsoft"]));
        let entities = ner
            .extract_entities("Apple and Microsoft are tech companies.", None)
            .unwrap();

        assert_eq!(entities.len(), 2);
        let found_org = |name: &str| {
            entities
                .iter()
                .any(|e| e.text == name && e.entity_type == EntityType::Organization)
        };
        assert!(found_org("Apple"));
        assert!(found_org("Microsoft"));
    }

    #[test]
    fn test_lexicon_ner_case_insensitive() {
        // Lexicon stores "Apple"; lowercase input should still match by default.
        let ner = LexiconNER::new(org_lexicon(&["Apple"]));
        let entities = ner.extract_entities("apple stock rose.", None).unwrap();

        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].text, "apple");
    }

    #[test]
    fn test_lexicon_ner_word_boundary() {
        // Whole-word matching (the default): "Apple" must not match inside
        // "AppleInc".
        let ner = LexiconNER::new(org_lexicon(&["Apple"]));
        let entities = ner
            .extract_entities("AppleInc is a company.", None)
            .unwrap();

        assert!(entities.is_empty());
    }

    #[test]
    fn test_lexicon_ner_no_word_boundary() {
        // Substring matching: "Apple" is found inside "AppleInc".
        let ner = LexiconNER::new(org_lexicon(&["Apple"])).with_word_boundary(false);
        let entities = ner.extract_entities("AppleInc", None).unwrap();

        assert!(entities.iter().any(|e| e.text == "Apple"));
    }

    #[test]
    fn test_lexicon_ner_unicode_offsets() {
        let mut lexicon = HashMapLexicon::new("test");
        lexicon.insert("東京", EntityType::Location, 0.99);

        let ner = LexiconNER::new(lexicon);
        let text = "Visit 東京 for tourism.";
        let entities = ner.extract_entities(text, None).unwrap();

        assert_eq!(entities.len(), 1);
        let entity = &entities[0];
        assert_eq!(entity.text, "東京");
        assert!(entity.start < entity.end);
        assert!(entity.end <= text.chars().count());
    }
}
369}