anno/backends/regex/
mod.rs

1//! Regex-based NER - Extracts entities via regex patterns only.
2//!
3//! No hardcoded gazetteers. Only extracts entities that can be reliably
4//! identified by their format:
5//! - Dates: ISO 8601, MM/DD/YYYY, "January 15, 2024", "Jan 15"
//!   - Multilingual: Japanese/Chinese 年月日, Korean 년 월 일, German/French/Spanish/Italian/Portuguese/Dutch/Russian months
7//! - Times: "3:30 PM", "14:00", "10am"
8//! - Money: $100, $1.5M, "50 dollars", €500
9//! - Percentages: 15%, 3.5%
10//! - Emails: user@example.com
11//! - URLs: `https://example.com`
//! - Phone numbers: (555) 123-4567, +1-555-123-4567
//! - Social handles: @mentions and #hashtags (emitted as `EntityType::Other`)
13//!
14//! For Person/Organization/Location, use ML models (BERT ONNX, GLiNER).
15
16use crate::{Entity, EntityType, Model, Result};
17use once_cell::sync::Lazy;
18use regex::Regex;
19
/// Pattern-driven entity extractor for values that are recognizable by their format alone.
///
/// Works without any ML model. It deliberately does NOT try to identify
/// Person/Organization/Location - those require contextual understanding.
///
/// The type is a stateless unit struct, so it is `Copy` and can be freely
/// duplicated or printed with `{:?}`.
///
/// # Supported Entity Types
///
/// | Type | Examples |
/// |------|----------|
/// | Date | "2024-01-15", "January 15, 2024", "2024年1月15日", "15 Januar" |
/// | Time | "3:30 PM", "14:00", "10am" |
/// | Money | "$100", "€50", "5 million dollars" |
/// | Percent | "15%", "3.5%" |
/// | Email | "user@example.com" |
/// | URL | `https://example.com` |
/// | Phone | "(555) 123-4567", "+1-555-1234" |
/// | Other("Mention") / Other("Hashtag") | "@user", "#topic" |
///
/// # Example
///
/// ```rust
/// use anno::{RegexNER, Model};
///
/// let ner = RegexNER::new();
/// let entities = ner.extract_entities(
///     "Meeting at 3:30 PM on Jan 15. Contact: bob@acme.com",
///     None
/// ).unwrap();
///
/// assert!(entities.len() >= 3); // time, date, email
/// ```
#[derive(Debug, Clone, Copy)]
pub struct RegexNER;
51
52impl RegexNER {
53    /// Create a new regex-based NER.
54    #[must_use]
55    pub fn new() -> Self {
56        Self
57    }
58}
59
60impl Default for RegexNER {
61    fn default() -> Self {
62        Self::new()
63    }
64}
65
66// Static regex patterns - compiled once, reused forever
67static DATE_ISO: Lazy<Regex> =
68    Lazy::new(|| Regex::new(r"\b\d{4}-\d{2}-\d{2}\b").expect("valid regex"));
69
70static DATE_US: Lazy<Regex> =
71    Lazy::new(|| Regex::new(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b").expect("valid regex"));
72
73static DATE_EU: Lazy<Regex> =
74    Lazy::new(|| Regex::new(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b").expect("valid regex"));
75
76static DATE_WRITTEN_FULL: Lazy<Regex> = Lazy::new(|| {
77    Regex::new(r"(?i)\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b").expect("valid regex")
78});
79
80static DATE_WRITTEN_SHORT: Lazy<Regex> = Lazy::new(|| {
81    Regex::new(r"(?i)\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b").expect("valid regex")
82});
83
84static DATE_WRITTEN_EU: Lazy<Regex> = Lazy::new(|| {
85    Regex::new(r"(?i)\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?(?:\s+\d{4})?\b").expect("valid regex")
86});
87
88// =============================================================================
89// Japanese Date Format: YYYY年MM月DD日
90// =============================================================================
91
92static DATE_JAPANESE: Lazy<Regex> = Lazy::new(|| {
93    // Matches: 2024年1月15日, 2024年01月15日, etc.
94    Regex::new(r"\d{4}年\d{1,2}月\d{1,2}日").expect("valid regex")
95});
96
97// =============================================================================
98// Multilingual Month Names
99// =============================================================================
100
101// German months: Januar, Februar, März, April, Mai, Juni, Juli, August, September, Oktober, November, Dezember
102static DATE_GERMAN_FULL: Lazy<Regex> = Lazy::new(|| {
103    Regex::new(r"(?i)\b(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s+\d{1,2}(?:\.)?(?:,?\s*\d{4})?\b").expect("valid regex")
104});
105
106static DATE_GERMAN_EU: Lazy<Regex> = Lazy::new(|| {
107    // "15. Januar 2024" or "15 Januar"
108    Regex::new(r"(?i)\b\d{1,2}\.?\s+(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)(?:\s+\d{4})?\b").expect("valid regex")
109});
110
111// French months: janvier, février, mars, avril, mai, juin, juillet, août, septembre, octobre, novembre, décembre
112static DATE_FRENCH_FULL: Lazy<Regex> = Lazy::new(|| {
113    Regex::new(r"(?i)\b(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)\s+\d{1,2}(?:,?\s*\d{4})?\b").expect("valid regex")
114});
115
116static DATE_FRENCH_EU: Lazy<Regex> = Lazy::new(|| {
117    // "15 janvier 2024" or "1er janvier"
118    Regex::new(r"(?i)\b\d{1,2}(?:er)?\s+(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)(?:\s+\d{4})?\b").expect("valid regex")
119});
120
121// Spanish months: enero, febrero, marzo, abril, mayo, junio, julio, agosto, septiembre, octubre, noviembre, diciembre
122static DATE_SPANISH_EU: Lazy<Regex> = Lazy::new(|| {
123    // "15 de enero de 2024" or "15 enero 2024"
124    Regex::new(r"(?i)\b\d{1,2}\s+(?:de\s+)?(?:enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre)(?:\s+(?:de\s+)?\d{4})?\b").expect("valid regex")
125});
126
127// Italian months: gennaio, febbraio, marzo, aprile, maggio, giugno, luglio, agosto, settembre, ottobre, novembre, dicembre
128static DATE_ITALIAN_EU: Lazy<Regex> = Lazy::new(|| {
129    Regex::new(r"(?i)\b\d{1,2}\s+(?:gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre)(?:\s+\d{4})?\b").expect("valid regex")
130});
131
132// Portuguese months: janeiro, fevereiro, março, abril, maio, junho, julho, agosto, setembro, outubro, novembro, dezembro
133static DATE_PORTUGUESE_EU: Lazy<Regex> = Lazy::new(|| {
134    // "15 de janeiro de 2024"
135    Regex::new(r"(?i)\b\d{1,2}\s+(?:de\s+)?(?:janeiro|fevereiro|março|marco|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)(?:\s+(?:de\s+)?\d{4})?\b").expect("valid regex")
136});
137
138// Dutch months: januari, februari, maart, april, mei, juni, juli, augustus, september, oktober, november, december
139static DATE_DUTCH_EU: Lazy<Regex> = Lazy::new(|| {
140    Regex::new(r"(?i)\b\d{1,2}\s+(?:januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)(?:\s+\d{4})?\b").expect("valid regex")
141});
142
143// Russian months (Cyrillic): январь, февраль, март, апрель, май, июнь, июль, август, сентябрь, октябрь, ноябрь, декабрь
144static DATE_RUSSIAN_EU: Lazy<Regex> = Lazy::new(|| {
145    // "15 января 2024" - uses genitive case forms
146    Regex::new(r"\b\d{1,2}\s+(?:января|февраля|марта|апреля|мая|июня|июля|августа|сентября|октября|ноября|декабря)(?:\s+\d{4})?\b").expect("valid regex")
147});
148
// Chinese dates use the same YYYY年MM月DD日 layout as Japanese ones,
// so they are already covered by DATE_JAPANESE.
151
152// Korean date format: YYYY년 MM월 DD일
153static DATE_KOREAN: Lazy<Regex> =
154    Lazy::new(|| Regex::new(r"\d{4}년\s*\d{1,2}월\s*\d{1,2}일").expect("valid regex"));
155
156static TIME_12H: Lazy<Regex> = Lazy::new(|| {
157    Regex::new(r"(?i)\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:am|pm|a\.m\.|p\.m\.)\b").expect("valid regex")
158});
159
160static TIME_24H: Lazy<Regex> =
161    Lazy::new(|| Regex::new(r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?\b").expect("valid regex"));
162
163static TIME_SIMPLE: Lazy<Regex> = Lazy::new(|| {
164    // Note: No trailing \b because a.m./p.m. end with .
165    Regex::new(r"(?i)\b\d{1,2}\s*(?:am\b|pm\b|a\.m\.|p\.m\.)").expect("valid regex")
166});
167
168static MONEY_SYMBOL: Lazy<Regex> = Lazy::new(|| {
169    Regex::new(r"[$€£¥][\d,]+(?:\.\d{1,2})?(?:\s*(?:billion|million|thousand|B|M|K|bn|mn))?")
170        .expect("valid regex")
171});
172
173static MONEY_WRITTEN: Lazy<Regex> = Lazy::new(|| {
174    Regex::new(
175        r"(?i)\b\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:dollars?|USD|euros?|EUR|pounds?|GBP|yen|JPY)\b",
176    )
177    .expect("valid regex")
178});
179
180static MONEY_MAGNITUDE: Lazy<Regex> = Lazy::new(|| {
181    Regex::new(
182        r"(?i)\b\d+(?:\.\d+)?\s*(?:billion|million|trillion)\s*(?:dollars?|euros?|pounds?)?\b",
183    )
184    .expect("valid regex")
185});
186
187static PERCENT: Lazy<Regex> = Lazy::new(|| {
188    // Note: No trailing \b because % is not a word character
189    Regex::new(r"\b\d+(?:\.\d+)?\s*(?:%|percent\b|pct\b)").expect("valid regex")
190});
191
192static EMAIL: Lazy<Regex> = Lazy::new(|| {
193    Regex::new(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b").expect("valid regex")
194});
195
196static URL: Lazy<Regex> =
197    Lazy::new(|| Regex::new(r"(?i)\bhttps?://[^\s<>\[\]{}|\\^`\x00-\x1f]+").expect("valid regex"));
198
199static PHONE_US: Lazy<Regex> = Lazy::new(|| {
200    Regex::new(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b").expect("valid regex")
201});
202
203static PHONE_INTL: Lazy<Regex> = Lazy::new(|| {
204    Regex::new(r"\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b").expect("valid regex")
205});
206
207static MENTION: Lazy<Regex> = Lazy::new(|| {
208    // @username - supports letters, numbers, underscore, dot (but not starting/ending with dot)
209    Regex::new(r"\B@[\w](?:[\w.]*[\w])?").expect("valid regex")
210});
211
212static HASHTAG: Lazy<Regex> = Lazy::new(|| {
213    // #hashtag - supports letters, numbers, underscore
214    Regex::new(r"\B#\w+").expect("valid regex")
215});
216
217impl Model for RegexNER {
218    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
219        use crate::offset::SpanConverter;
220        use anno_core::Provenance;
221        let mut entities = Vec::new();
222
223        // Performance optimization: Build SpanConverter once for all byte-to-char conversions
224        // ROI: High - called once per extract_entities, saves O(n) per regex match
225        let converter = SpanConverter::new(text);
226
227        // Helper to add entity if no overlap
228        // Note: regex returns byte offsets, but we convert to char offsets
229        // for consistency with evaluation (GoldEntity uses char offsets).
230        let mut add_entity =
231            |m: regex::Match, entity_type: EntityType, confidence: f64, pattern: &'static str| {
232                // Convert byte offsets to character offsets for Unicode correctness
233                // Use optimized SpanConverter instead of bytes_to_chars
234                let char_start = converter.byte_to_char(m.start());
235                let char_end = converter.byte_to_char(m.end());
236                if !overlaps(&entities, char_start, char_end) {
237                    entities.push(Entity::with_provenance(
238                        m.as_str(),
239                        entity_type,
240                        char_start,
241                        char_end,
242                        confidence,
243                        Provenance::pattern(pattern),
244                    ));
245                }
246            };
247
248        // Dates (high confidence - very specific patterns)
249        // English dates
250        let date_patterns_en: &[(&Lazy<Regex>, &'static str)] = &[
251            (&DATE_ISO, "DATE_ISO"),
252            (&DATE_US, "DATE_US"),
253            (&DATE_EU, "DATE_EU"),
254            (&DATE_WRITTEN_FULL, "DATE_WRITTEN_FULL"),
255            (&DATE_WRITTEN_SHORT, "DATE_WRITTEN_SHORT"),
256            (&DATE_WRITTEN_EU, "DATE_WRITTEN_EU"),
257        ];
258        for (pattern, name) in date_patterns_en {
259            for m in pattern.find_iter(text) {
260                add_entity(m, EntityType::Date, 0.95, name);
261            }
262        }
263
264        // Multilingual dates (Japanese, Korean, German, French, Spanish, etc.)
265        let date_patterns_i18n: &[(&Lazy<Regex>, &'static str)] = &[
266            (&DATE_JAPANESE, "DATE_JAPANESE"),
267            (&DATE_KOREAN, "DATE_KOREAN"),
268            (&DATE_GERMAN_FULL, "DATE_GERMAN_FULL"),
269            (&DATE_GERMAN_EU, "DATE_GERMAN_EU"),
270            (&DATE_FRENCH_FULL, "DATE_FRENCH_FULL"),
271            (&DATE_FRENCH_EU, "DATE_FRENCH_EU"),
272            (&DATE_SPANISH_EU, "DATE_SPANISH_EU"),
273            (&DATE_ITALIAN_EU, "DATE_ITALIAN_EU"),
274            (&DATE_PORTUGUESE_EU, "DATE_PORTUGUESE_EU"),
275            (&DATE_DUTCH_EU, "DATE_DUTCH_EU"),
276            (&DATE_RUSSIAN_EU, "DATE_RUSSIAN_EU"),
277        ];
278        for (pattern, name) in date_patterns_i18n {
279            for m in pattern.find_iter(text) {
280                add_entity(m, EntityType::Date, 0.93, name); // Slightly lower confidence for i18n
281            }
282        }
283
284        // Times
285        let time_patterns: &[(&Lazy<Regex>, &'static str)] = &[
286            (&TIME_12H, "TIME_12H"),
287            (&TIME_24H, "TIME_24H"),
288            (&TIME_SIMPLE, "TIME_SIMPLE"),
289        ];
290        for (pattern, name) in time_patterns {
291            for m in pattern.find_iter(text) {
292                add_entity(m, EntityType::Time, 0.90, name);
293            }
294        }
295
296        // Money (high confidence)
297        let money_patterns: &[(&Lazy<Regex>, &'static str)] = &[
298            (&MONEY_SYMBOL, "MONEY_SYMBOL"),
299            (&MONEY_WRITTEN, "MONEY_WRITTEN"),
300            (&MONEY_MAGNITUDE, "MONEY_MAGNITUDE"),
301        ];
302        for (pattern, name) in money_patterns {
303            for m in pattern.find_iter(text) {
304                add_entity(m, EntityType::Money, 0.95, name);
305            }
306        }
307
308        // Percentages
309        for m in PERCENT.find_iter(text) {
310            add_entity(m, EntityType::Percent, 0.95, "PERCENT");
311        }
312
313        // Emails (very high confidence - very specific pattern)
314        for m in EMAIL.find_iter(text) {
315            add_entity(m, EntityType::Email, 0.98, "EMAIL");
316        }
317
318        // URLs (very high confidence)
319        for m in URL.find_iter(text) {
320            add_entity(m, EntityType::Url, 0.98, "URL");
321        }
322
323        // Phone numbers (medium confidence - can have false positives)
324        let phone_patterns: &[(&Lazy<Regex>, &'static str)] =
325            &[(&PHONE_US, "PHONE_US"), (&PHONE_INTL, "PHONE_INTL")];
326        for (pattern, name) in phone_patterns {
327            for m in pattern.find_iter(text) {
328                add_entity(m, EntityType::Phone, 0.85, name);
329            }
330        }
331
332        // Social Media (@mentions and #hashtags) - note: mapping to Other for now as specific types don't exist yet
333        for m in MENTION.find_iter(text) {
334            // Using a custom "Mention" type via Other
335            // In future refactor: Add EntityType::Mention
336            let char_start = converter.byte_to_char(m.start());
337            let char_end = converter.byte_to_char(m.end());
338            if !overlaps(&entities, char_start, char_end) {
339                // We use EntityType::Other for now, but specific string "Mention"
340                entities.push(Entity::with_provenance(
341                    m.as_str(),
342                    EntityType::Other("Mention".to_string()),
343                    char_start,
344                    char_end,
345                    0.95,
346                    Provenance::pattern("MENTION"),
347                ));
348            }
349        }
350
351        for m in HASHTAG.find_iter(text) {
352            let char_start = converter.byte_to_char(m.start());
353            let char_end = converter.byte_to_char(m.end());
354            if !overlaps(&entities, char_start, char_end) {
355                entities.push(Entity::with_provenance(
356                    m.as_str(),
357                    EntityType::Other("Hashtag".to_string()),
358                    char_start,
359                    char_end,
360                    0.95,
361                    Provenance::pattern("HASHTAG"),
362                ));
363            }
364        }
365
366        // Performance: Use unstable sort (we don't need stable sort here)
367        // Sort by position for consistent output
368        entities.sort_unstable_by_key(|e| e.start);
369
370        Ok(entities)
371    }
372
373    fn supported_types(&self) -> Vec<EntityType> {
374        vec![
375            EntityType::Date,
376            EntityType::Time,
377            EntityType::Money,
378            EntityType::Percent,
379            EntityType::Email,
380            EntityType::Url,
381            EntityType::Phone,
382        ]
383    }
384
385    fn is_available(&self) -> bool {
386        true
387    }
388
389    fn name(&self) -> &'static str {
390        "regex"
391    }
392
393    fn description(&self) -> &'static str {
394        "Regex-based NER (dates, times, money, percentages, emails, URLs, phones)"
395    }
396
397    fn capabilities(&self) -> crate::ModelCapabilities {
398        crate::ModelCapabilities {
399            batch_capable: true,
400            streaming_capable: true,
401            ..Default::default()
402        }
403    }
404}
405
406/// Check if a span overlaps with existing entities.
407fn overlaps(entities: &[Entity], start: usize, end: usize) -> bool {
408    entities.iter().any(|e| !(end <= e.start || start >= e.end))
409}
410
// Capability marker: RegexNER extracts structured entities via regex.
// The impl body is empty - nothing is overridden here; the impl only opts the
// type into `StructuredEntityCapable`.
impl crate::StructuredEntityCapable for RegexNER {}
413
414#[cfg(test)]
415mod tests {
416    use super::*;
417
418    fn ner() -> RegexNER {
419        RegexNER::new()
420    }
421
422    fn extract(text: &str) -> Vec<Entity> {
423        ner()
424            .extract_entities(text, None)
425            .expect("NER extraction should succeed")
426    }
427
428    fn has_type(entities: &[Entity], ty: &EntityType) -> bool {
429        entities.iter().any(|e| &e.entity_type == ty)
430    }
431
432    fn count_type(entities: &[Entity], ty: &EntityType) -> usize {
433        entities.iter().filter(|e| &e.entity_type == ty).count()
434    }
435
436    fn find_text<'a>(entities: &'a [Entity], text: &str) -> Option<&'a Entity> {
437        entities.iter().find(|e| e.text == text)
438    }
439
440    // ========================================================================
441    // Date Tests
442    // ========================================================================
443
444    #[test]
445    fn date_iso_format() {
446        let e = extract("Meeting on 2024-01-15.");
447        assert!(find_text(&e, "2024-01-15").is_some());
448    }
449
450    #[test]
451    fn date_us_format() {
452        let e = extract("Due by 12/31/2024 and 1/5/24.");
453        assert_eq!(count_type(&e, &EntityType::Date), 2);
454    }
455
456    #[test]
457    fn date_eu_format() {
458        let e = extract("Released on 31.12.2024.");
459        assert!(find_text(&e, "31.12.2024").is_some());
460    }
461
462    #[test]
463    fn date_written_full() {
464        let cases = [
465            "January 15, 2024",
466            "February 28",
467            "March 1st, 2024",
468            "December 25th",
469        ];
470        for case in cases {
471            let e = extract(case);
472            assert!(has_type(&e, &EntityType::Date), "Failed: {}", case);
473        }
474    }
475
476    #[test]
477    fn date_written_short() {
478        let cases = ["Jan 15, 2024", "Feb 28", "Mar. 1st", "Dec 25th, 2024"];
479        for case in cases {
480            let e = extract(case);
481            assert!(has_type(&e, &EntityType::Date), "Failed: {}", case);
482        }
483    }
484
485    #[test]
486    fn date_eu_written() {
487        let cases = ["15 January 2024", "28th February", "1st March 2024"];
488        for case in cases {
489            let e = extract(case);
490            assert!(has_type(&e, &EntityType::Date), "Failed: {}", case);
491        }
492    }
493
494    // ========================================================================
495    // Time Tests
496    // ========================================================================
497
498    #[test]
499    fn time_12h_format() {
500        let cases = ["3:30 PM", "10:00 am", "12:30:45 p.m.", "9:00 AM"];
501        for case in cases {
502            let e = extract(case);
503            assert!(has_type(&e, &EntityType::Time), "Failed: {}", case);
504        }
505    }
506
507    #[test]
508    fn time_24h_format() {
509        let cases = ["14:30", "09:00", "23:59:59", "0:00"];
510        for case in cases {
511            let e = extract(case);
512            assert!(has_type(&e, &EntityType::Time), "Failed: {}", case);
513        }
514    }
515
516    #[test]
517    fn time_simple() {
518        let cases = ["3pm", "10 AM", "9 a.m."];
519        for case in cases {
520            let e = extract(case);
521            assert!(has_type(&e, &EntityType::Time), "Failed: {}", case);
522        }
523    }
524
525    // ========================================================================
526    // Money Tests
527    // ========================================================================
528
529    #[test]
530    fn money_dollar_basic() {
531        let cases = ["$100", "$1,000", "$99.99", "$1,234,567.89"];
532        for case in cases {
533            let e = extract(case);
534            assert!(has_type(&e, &EntityType::Money), "Failed: {}", case);
535        }
536    }
537
538    #[test]
539    fn money_with_magnitude() {
540        let cases = ["$5 million", "$1.5B", "$100K", "$2 billion"];
541        for case in cases {
542            let e = extract(case);
543            assert!(has_type(&e, &EntityType::Money), "Failed: {}", case);
544        }
545    }
546
547    #[test]
548    fn money_other_currencies() {
549        let cases = ["€500", "£100", "¥1000"];
550        for case in cases {
551            let e = extract(case);
552            assert!(has_type(&e, &EntityType::Money), "Failed: {}", case);
553        }
554    }
555
556    #[test]
557    fn money_unicode_offsets_correct() {
558        // Regression test: Entity offsets must be CHARACTER offsets, not byte offsets.
559        // Euro sign (€) is 3 bytes but 1 character.
560        // This test catches the bug where regex byte offsets were stored directly.
561        let text = "Price: €50 then €100";
562        let ner = RegexNER::new();
563        let entities = ner
564            .extract_entities(text, None)
565            .expect("NER extraction should succeed");
566
567        // "Price: " = 7 chars, so first € is at char 7
568        // "€50 then " = 9 chars, so second € is at char 16
569        let money: Vec<_> = entities
570            .iter()
571            .filter(|e| e.entity_type == EntityType::Money)
572            .collect();
573
574        assert_eq!(money.len(), 2, "Expected 2 money entities, got {:?}", money);
575
576        // First entity: "€50" at char 7
577        assert_eq!(money[0].start, 7, "First € should be at char 7, not byte 7");
578        assert_eq!(money[0].end, 10, "First entity end should be char 10");
579
580        // Second entity: "€100" at char 16
581        assert_eq!(
582            money[1].start, 16,
583            "Second € should be at char 16, not byte 18"
584        );
585        assert_eq!(money[1].end, 20, "Second entity end should be char 20");
586    }
587
588    #[test]
589    fn money_written() {
590        let cases = [
591            "50 dollars",
592            "100 USD",
593            "500 euros",
594            "1000 EUR",
595            "200 pounds",
596        ];
597        for case in cases {
598            let e = extract(case);
599            assert!(has_type(&e, &EntityType::Money), "Failed: {}", case);
600        }
601    }
602
603    #[test]
604    fn money_magnitude_written() {
605        let cases = ["5 billion dollars", "1.5 million euros", "100 million"];
606        for case in cases {
607            let e = extract(case);
608            assert!(has_type(&e, &EntityType::Money), "Failed: {}", case);
609        }
610    }
611
612    // ========================================================================
613    // Percent Tests
614    // ========================================================================
615
616    #[test]
617    fn percent_basic() {
618        let cases = ["15%", "3.5%", "100%", "0.01%"];
619        for case in cases {
620            let e = extract(case);
621            assert!(has_type(&e, &EntityType::Percent), "Failed: {}", case);
622        }
623    }
624
625    #[test]
626    fn percent_written() {
627        let cases = ["15 percent", "50 pct"];
628        for case in cases {
629            let e = extract(case);
630            assert!(has_type(&e, &EntityType::Percent), "Failed: {}", case);
631        }
632    }
633
634    // ========================================================================
635    // Email Tests
636    // ========================================================================
637
638    #[test]
639    fn email_basic() {
640        let cases = [
641            "user@example.com",
642            "john.doe@company.org",
643            "support+ticket@help.co.uk",
644            "test_123@sub.domain.io",
645        ];
646        for case in cases {
647            let e = extract(case);
648            assert!(
649                e.iter().any(|e| e.entity_type == EntityType::Email),
650                "Failed: {}",
651                case
652            );
653        }
654    }
655
656    // ========================================================================
657    // URL Tests
658    // ========================================================================
659
660    #[test]
661    fn url_basic() {
662        let cases = [
663            "https://example.com",
664            "http://www.google.com",
665            "https://sub.domain.co.uk/path?query=1",
666            "http://localhost:8080/api",
667        ];
668        for case in cases {
669            let e = extract(case);
670            assert!(
671                e.iter().any(|e| e.entity_type == EntityType::Url),
672                "Failed: {}",
673                case
674            );
675        }
676    }
677
678    // ========================================================================
679    // Phone Tests
680    // ========================================================================
681
682    #[test]
683    fn phone_us_format() {
684        let cases = [
685            "(555) 123-4567",
686            "555-123-4567",
687            "555.123.4567",
688            "1-555-123-4567",
689            "+1 555 123 4567",
690        ];
691        for case in cases {
692            let e = extract(case);
693            assert!(
694                e.iter().any(|e| e.entity_type == EntityType::Phone),
695                "Failed: {}",
696                case
697            );
698        }
699    }
700
701    #[test]
702    fn phone_international() {
703        let cases = ["+44 20 7946 0958", "+81 3 1234 5678"];
704        for case in cases {
705            let e = extract(case);
706            assert!(
707                e.iter().any(|e| e.entity_type == EntityType::Phone),
708                "Failed: {}",
709                case
710            );
711        }
712    }
713
714    // ========================================================================
715    // Integration Tests
716    // ========================================================================
717
718    #[test]
719    fn mixed_entities() {
720        let text = "Meeting on Jan 15 at 3:30 PM. Cost: $500. Contact: bob@acme.com or (555) 123-4567. Completion: 75%.";
721        let e = extract(text);
722
723        assert!(has_type(&e, &EntityType::Date), "Should have Date: {:?}", e);
724        assert!(has_type(&e, &EntityType::Time), "Should have Time: {:?}", e);
725        assert!(
726            has_type(&e, &EntityType::Money),
727            "Should have Money: {:?}",
728            e
729        );
730        assert!(
731            has_type(&e, &EntityType::Percent),
732            "Should have Percent: {:?}",
733            e
734        );
735        assert!(
736            e.iter().any(|e| e.entity_type == EntityType::Email),
737            "Should have Email: {:?}",
738            e
739        );
740        assert!(
741            e.iter().any(|e| e.entity_type == EntityType::Phone),
742            "Should have Phone: {:?}",
743            e
744        );
745    }
746
747    #[test]
748    fn no_person_org_loc() {
749        let e = extract("John Smith works at Google in New York.");
750        // Should NOT extract Person/Org/Location
751        assert!(!has_type(&e, &EntityType::Person));
752        assert!(!has_type(&e, &EntityType::Organization));
753        assert!(!has_type(&e, &EntityType::Location));
754    }
755
756    #[test]
757    fn entities_sorted_by_position() {
758        let e = extract("$100 on 2024-01-01 at 50%");
759        let positions: Vec<usize> = e.iter().map(|e| e.start).collect();
760        let mut sorted = positions.clone();
761        sorted.sort();
762        assert_eq!(positions, sorted);
763    }
764
765    #[test]
766    fn no_overlapping_entities() {
767        let e = extract("The price is $1,000,000 (1 million dollars).");
768        for i in 0..e.len() {
769            for j in (i + 1)..e.len() {
770                let overlap = e[i].start < e[j].end && e[j].start < e[i].end;
771                assert!(!overlap, "Overlap: {:?} and {:?}", e[i], e[j]);
772            }
773        }
774    }
775
776    #[test]
777    fn empty_text() {
778        let e = extract("");
779        assert!(e.is_empty());
780    }
781
782    #[test]
783    fn no_entities_text() {
784        let e = extract("The quick brown fox jumps over the lazy dog.");
785        assert!(e.is_empty());
786    }
787
788    #[test]
789    fn entity_spans_correct() {
790        use crate::offset::TextSpan;
791
792        let text = "Cost: $100";
793        let e = extract(text);
794        let money = find_text(&e, "$100").expect("money entity should be found");
795        assert_eq!(
796            TextSpan::from_chars(text, money.start, money.end).extract(text),
797            "$100"
798        );
799    }
800
801    #[test]
802    fn provenance_attached() {
803        use anno_core::ExtractionMethod;
804
805        let text = "Contact: test@email.com on 2024-01-15";
806        let e = extract(text);
807
808        // All entities should have provenance
809        for entity in &e {
810            assert!(
811                entity.provenance.is_some(),
812                "Missing provenance for {:?}",
813                entity
814            );
815            let prov = entity
816                .provenance
817                .as_ref()
818                .expect("provenance should be set");
819
820            // Source should be "pattern"
821            assert_eq!(prov.source.as_ref(), "pattern");
822            assert_eq!(prov.method, ExtractionMethod::Pattern);
823
824            // Pattern name should be set
825            assert!(
826                prov.pattern.is_some(),
827                "Missing pattern name for {:?}",
828                entity
829            );
830        }
831
832        // Check specific pattern names
833        let email = find_text(&e, "test@email.com").expect("email entity should be found");
834        assert_eq!(
835            email
836                .provenance
837                .as_ref()
838                .expect("provenance should be set")
839                .pattern
840                .as_ref()
841                .expect("pattern should be set")
842                .as_ref(),
843            "EMAIL"
844        );
845
846        let date = find_text(&e, "2024-01-15").expect("date entity should be found");
847        assert_eq!(
848            date.provenance
849                .as_ref()
850                .expect("provenance should be set")
851                .pattern
852                .as_ref()
853                .expect("pattern should be set")
854                .as_ref(),
855            "DATE_ISO"
856        );
857    }
858
859    // ========================================================================
860    // Multilingual Date Tests
861    // ========================================================================
862
863    #[test]
864    fn japanese_date_format() {
865        let cases = ["2024年1月15日", "2024年12月31日", "2000年01月01日"];
866        for case in cases {
867            let e = extract(case);
868            assert!(has_type(&e, &EntityType::Date), "Failed: {}", case);
869            assert_eq!(e[0].text, case);
870        }
871    }
872
873    #[test]
874    fn korean_date_format() {
875        let cases = ["2024년 1월 15일", "2024년 12월 31일"];
876        for case in cases {
877            let e = extract(case);
878            assert!(has_type(&e, &EntityType::Date), "Failed: {}", case);
879        }
880    }
881
882    #[test]
883    fn german_month_names() {
884        let cases = [
885            ("15. Januar 2024", "15. Januar 2024"),
886            ("3 März 2023", "3 März 2023"),
887            ("25 Dezember", "25 Dezember"),
888        ];
889        for (text, expected) in cases {
890            let e = extract(text);
891            assert!(has_type(&e, &EntityType::Date), "Failed: {}", text);
892            assert!(
893                find_text(&e, expected).is_some(),
894                "Expected '{}' in: {}",
895                expected,
896                text
897            );
898        }
899    }
900
901    #[test]
902    fn french_month_names() {
903        let cases = ["15 janvier 2024", "1er février 2023", "25 décembre"];
904        for case in cases {
905            let e = extract(case);
906            assert!(has_type(&e, &EntityType::Date), "Failed: {}", case);
907        }
908    }
909
910    #[test]
911    fn spanish_month_names() {
912        let cases = ["15 de enero de 2024", "5 marzo 2023", "25 diciembre"];
913        for case in cases {
914            let e = extract(case);
915            assert!(has_type(&e, &EntityType::Date), "Failed: {}", case);
916        }
917    }
918
919    #[test]
920    fn italian_month_names() {
921        let e = extract("15 gennaio 2024");
922        assert!(has_type(&e, &EntityType::Date));
923    }
924
925    #[test]
926    fn portuguese_month_names() {
927        let e = extract("15 de janeiro de 2024");
928        assert!(has_type(&e, &EntityType::Date));
929    }
930
931    #[test]
932    fn dutch_month_names() {
933        let e = extract("15 januari 2024");
934        assert!(has_type(&e, &EntityType::Date));
935    }
936
937    #[test]
938    fn russian_month_names() {
939        let e = extract("15 января 2024");
940        assert!(has_type(&e, &EntityType::Date));
941    }
942
943    #[test]
944    fn multilingual_dates_with_context() {
945        // Test that multilingual dates work in context with other text
946        let text = "Meeting on 2024年1月15日 at the office. Follow-up on 15 janvier.";
947        let e = extract(text);
948        let dates: Vec<_> = e
949            .iter()
950            .filter(|e| e.entity_type == EntityType::Date)
951            .collect();
952        assert_eq!(dates.len(), 2, "Expected 2 dates, got {:?}", dates);
953    }
954}
955
956// =============================================================================
957// BatchCapable and StreamingCapable Trait Implementations
958// =============================================================================
959
960impl crate::BatchCapable for RegexNER {
961    fn extract_entities_batch(
962        &self,
963        texts: &[&str],
964        language: Option<&str>,
965    ) -> Result<Vec<Vec<Entity>>> {
966        texts
967            .iter()
968            .map(|text| self.extract_entities(text, language))
969            .collect()
970    }
971
972    fn optimal_batch_size(&self) -> Option<usize> {
973        Some(64) // Regex matching is fast, can handle larger batches
974    }
975}
976
977impl crate::StreamingCapable for RegexNER {
978    fn recommended_chunk_size(&self) -> usize {
979        10_000 // Regex matching handles larger chunks efficiently
980    }
981}
982
// `proptests` submodule (presumably property-based tests, going by the name);
// only compiled under `cfg(test)`.
#[cfg(test)]
mod proptests;
anno/backends/regex/mod.rs

anno/backends/regex/
mod.rs