1use crate::{Entity, EntityType, Model, Result};
17use once_cell::sync::Lazy;
18use regex::Regex;
19
/// Regex-driven extractor for pattern-shaped entities (dates, times, money,
/// percentages, emails, URLs, phone numbers, mentions and hashtags).
///
/// The type holds no state: every pattern lives in a lazily compiled static in
/// this file, so values are free to construct and clone.
#[derive(Debug, Clone)]
pub struct RegexNER;
51
52impl RegexNER {
53 #[must_use]
55 pub fn new() -> Self {
56 Self
57 }
58}
59
60impl Default for RegexNER {
61 fn default() -> Self {
62 Self::new()
63 }
64}
65
/// Hyphen-separated numeric dates: four digits, two digits, two digits
/// (e.g. `2024-01-15`). The digit groups are not range-checked.
static DATE_ISO: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b\d{4}-\d{2}-\d{2}\b").expect("valid regex"));

/// Slash-separated numeric dates: one or two digits, one or two digits, then
/// two to four digits (e.g. `12/31/2024`, `1/5/24`). Not range-checked.
static DATE_US: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b").expect("valid regex"));

/// Dot-separated numeric dates with the same digit counts as [`DATE_US`]
/// (e.g. `31.12.2024`). Not range-checked.
static DATE_EU: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b").expect("valid regex"));
75
/// Month-first English dates with the full month name (case-insensitive):
/// month, whitespace, a one- or two-digit day with an optional ordinal suffix
/// (`st`/`nd`/`rd`/`th`), and an optional four-digit year that may follow a
/// comma (e.g. `January 15, 2024`, `December 25th`).
static DATE_WRITTEN_FULL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b").expect("valid regex")
});

/// Same shape as [`DATE_WRITTEN_FULL`] but with abbreviated month names, each
/// optionally followed by a period (e.g. `Jan 15, 2024`, `Mar. 1st`).
static DATE_WRITTEN_SHORT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b").expect("valid regex")
});

/// Day-first English dates (case-insensitive): a one- or two-digit day with an
/// optional ordinal suffix, whitespace, a full or abbreviated month name with
/// an optional period, and an optional four-digit year
/// (e.g. `15 January 2024`, `28th February`).
static DATE_WRITTEN_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?(?:\s+\d{4})?\b").expect("valid regex")
});
87
/// Dates written with the `年`/`月`/`日` markers: four-digit year, one- or
/// two-digit month and day, with no whitespace between the parts
/// (e.g. `2024年1月15日`). No word-boundary anchors are used.
static DATE_JAPANESE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\d{4}年\d{1,2}月\d{1,2}日").expect("valid regex")
});

/// Month-first dates with German month names (case-insensitive): month, day
/// with an optional trailing period, optional four-digit year.
static DATE_GERMAN_FULL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s+\d{1,2}(?:\.)?(?:,?\s*\d{4})?\b").expect("valid regex")
});

/// Day-first dates with German month names (case-insensitive): day with an
/// optional period, month, optional four-digit year (e.g. `15. Januar 2024`).
static DATE_GERMAN_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}\.?\s+(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)(?:\s+\d{4})?\b").expect("valid regex")
});

/// Month-first dates with French month names (case-insensitive). Both accented
/// and unaccented spellings listed in the pattern are accepted
/// (e.g. `février`/`fevrier`).
static DATE_FRENCH_FULL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)\s+\d{1,2}(?:,?\s*\d{4})?\b").expect("valid regex")
});

/// Day-first dates with French month names (case-insensitive); the day may
/// carry an `er` suffix (e.g. `1er février 2023`), and the year is optional.
static DATE_FRENCH_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}(?:er)?\s+(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)(?:\s+\d{4})?\b").expect("valid regex")
});

/// Day-first dates with Spanish month names (case-insensitive); `de` is
/// optional before the month and before the year
/// (e.g. `15 de enero de 2024`, `5 marzo 2023`).
static DATE_SPANISH_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}\s+(?:de\s+)?(?:enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre)(?:\s+(?:de\s+)?\d{4})?\b").expect("valid regex")
});

/// Day-first dates with Italian month names (case-insensitive), optional year.
static DATE_ITALIAN_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}\s+(?:gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre)(?:\s+\d{4})?\b").expect("valid regex")
});

/// Day-first dates with Portuguese month names (case-insensitive); `de` is
/// optional before the month and the year, and both `março` and `marco` match.
static DATE_PORTUGUESE_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}\s+(?:de\s+)?(?:janeiro|fevereiro|março|marco|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)(?:\s+(?:de\s+)?\d{4})?\b").expect("valid regex")
});

/// Day-first dates with Dutch month names (case-insensitive), optional year.
static DATE_DUTCH_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}\s+(?:januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)(?:\s+\d{4})?\b").expect("valid regex")
});

/// Day-first dates with the Russian month spellings listed in the pattern
/// (e.g. `15 января 2024`), optional year. Unlike the other month-name
/// patterns this one has no `(?i)` flag, so it is case-sensitive.
static DATE_RUSSIAN_EU: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\b\d{1,2}\s+(?:января|февраля|марта|апреля|мая|июня|июля|августа|сентября|октября|ноября|декабря)(?:\s+\d{4})?\b").expect("valid regex")
});

/// Dates written with the `년`/`월`/`일` markers: four-digit year, one- or
/// two-digit month and day, with optional whitespace between the parts
/// (e.g. `2024년 1월 15일`). No word-boundary anchors are used.
static DATE_KOREAN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\d{4}년\s*\d{1,2}월\s*\d{1,2}일").expect("valid regex"));
155
156static TIME_12H: Lazy<Regex> = Lazy::new(|| {
157 Regex::new(r"(?i)\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:am|pm|a\.m\.|p\.m\.)\b").expect("valid regex")
158});
159
/// Clock times without an am/pm marker: hour `0`-`23` (leading zero optional
/// below 20), minutes `00`-`59`, and optional seconds `00`-`59`
/// (e.g. `14:30`, `0:00`, `23:59:59`).
static TIME_24H: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?\b").expect("valid regex"));

/// Hour-only times with an am/pm marker (case-insensitive): one or two digits,
/// optional whitespace, then `am`, `pm`, `a.m.` or `p.m.` (e.g. `3pm`,
/// `9 a.m.`). The word boundary is applied only after the undotted forms; the
/// dotted forms end in `.`, where a trailing `\b` would not hold before
/// whitespace or end of text.
static TIME_SIMPLE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b\d{1,2}\s*(?:am\b|pm\b|a\.m\.|p\.m\.)").expect("valid regex")
});
167
168static MONEY_SYMBOL: Lazy<Regex> = Lazy::new(|| {
169 Regex::new(r"[$€£¥][\d,]+(?:\.\d{1,2})?(?:\s*(?:billion|million|thousand|B|M|K|bn|mn))?")
170 .expect("valid regex")
171});
172
/// Amounts followed by a currency word or code (case-insensitive): digits with
/// optional `,ddd` groups and an optional one- or two-digit decimal part, then
/// `dollar(s)`, `USD`, `euro(s)`, `EUR`, `pound(s)`, `GBP`, `yen` or `JPY`
/// (e.g. `50 dollars`, `1000 EUR`).
static MONEY_WRITTEN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)\b\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:dollars?|USD|euros?|EUR|pounds?|GBP|yen|JPY)\b",
    )
    .expect("valid regex")
});

/// Amounts expressed with a magnitude word (case-insensitive): a number with an
/// optional decimal part, then `billion`, `million` or `trillion`, optionally
/// followed by `dollar(s)`, `euro(s)` or `pound(s)`
/// (e.g. `1.5 million euros`, `100 million`).
static MONEY_MAGNITUDE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)\b\d+(?:\.\d+)?\s*(?:billion|million|trillion)\s*(?:dollars?|euros?|pounds?)?\b",
    )
    .expect("valid regex")
});
186
/// Percentages: a number with an optional decimal part, optional whitespace,
/// then `%`, `percent` or `pct` (e.g. `3.5%`, `15 percent`). The word forms
/// are case-sensitive and must end at a word boundary; `%` needs none.
static PERCENT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\b\d+(?:\.\d+)?\s*(?:%|percent\b|pct\b)").expect("valid regex")
});

/// Email-shaped tokens: a local part of letters, digits and `._%+-`, an `@`,
/// a domain of letters, digits, dots and hyphens, and a final alphabetic
/// label of at least two letters (e.g. `support+ticket@help.co.uk`).
static EMAIL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b").expect("valid regex")
});

/// `http://` or `https://` URLs (scheme matched case-insensitively). The match
/// runs until the first whitespace, control character, or one of
/// `<>[]{}|\^` and backtick, so trailing punctuation such as `.` or `)` is
/// included in the match.
static URL: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)\bhttps?://[^\s<>\[\]{}|\\^`\x00-\x1f]+").expect("valid regex"));
198
/// Ten-digit phone numbers in 3-3-4 grouping with optional `-`, `.` or
/// whitespace separators, optional parentheses around the first group, and an
/// optional leading `1` / `+1` (e.g. `(555) 123-4567`, `+1 555 123 4567`).
/// Only the end of the match is anchored with a word boundary.
static PHONE_US: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b").expect("valid regex")
});

/// Phone numbers that start with `+` and a one- to three-digit prefix, followed
/// by three further digit groups (1-4, 1-4 and 1-9 digits) with optional `-`,
/// `.` or whitespace separators (e.g. `+44 20 7946 0958`).
static PHONE_INTL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b").expect("valid regex")
});
206
/// `@handle` mentions. The leading `\B` requires that the `@` is not directly
/// preceded by a word character, so the `@` inside an email address does not
/// start a mention. The handle starts and ends with a word character and may
/// contain dots in between.
static MENTION: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\B@[\w](?:[\w.]*[\w])?").expect("valid regex")
});

/// `#tag` hashtags: a `#` not directly preceded by a word character, followed
/// by one or more word characters.
static HASHTAG: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\B#\w+").expect("valid regex")
});
216
217impl Model for RegexNER {
218 fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
219 use crate::offset::SpanConverter;
220 use anno_core::Provenance;
221 let mut entities = Vec::new();
222
223 let converter = SpanConverter::new(text);
226
227 let mut add_entity =
231 |m: regex::Match, entity_type: EntityType, confidence: f64, pattern: &'static str| {
232 let char_start = converter.byte_to_char(m.start());
235 let char_end = converter.byte_to_char(m.end());
236 if !overlaps(&entities, char_start, char_end) {
237 entities.push(Entity::with_provenance(
238 m.as_str(),
239 entity_type,
240 char_start,
241 char_end,
242 confidence,
243 Provenance::pattern(pattern),
244 ));
245 }
246 };
247
248 let date_patterns_en: &[(&Lazy<Regex>, &'static str)] = &[
251 (&DATE_ISO, "DATE_ISO"),
252 (&DATE_US, "DATE_US"),
253 (&DATE_EU, "DATE_EU"),
254 (&DATE_WRITTEN_FULL, "DATE_WRITTEN_FULL"),
255 (&DATE_WRITTEN_SHORT, "DATE_WRITTEN_SHORT"),
256 (&DATE_WRITTEN_EU, "DATE_WRITTEN_EU"),
257 ];
258 for (pattern, name) in date_patterns_en {
259 for m in pattern.find_iter(text) {
260 add_entity(m, EntityType::Date, 0.95, name);
261 }
262 }
263
264 let date_patterns_i18n: &[(&Lazy<Regex>, &'static str)] = &[
266 (&DATE_JAPANESE, "DATE_JAPANESE"),
267 (&DATE_KOREAN, "DATE_KOREAN"),
268 (&DATE_GERMAN_FULL, "DATE_GERMAN_FULL"),
269 (&DATE_GERMAN_EU, "DATE_GERMAN_EU"),
270 (&DATE_FRENCH_FULL, "DATE_FRENCH_FULL"),
271 (&DATE_FRENCH_EU, "DATE_FRENCH_EU"),
272 (&DATE_SPANISH_EU, "DATE_SPANISH_EU"),
273 (&DATE_ITALIAN_EU, "DATE_ITALIAN_EU"),
274 (&DATE_PORTUGUESE_EU, "DATE_PORTUGUESE_EU"),
275 (&DATE_DUTCH_EU, "DATE_DUTCH_EU"),
276 (&DATE_RUSSIAN_EU, "DATE_RUSSIAN_EU"),
277 ];
278 for (pattern, name) in date_patterns_i18n {
279 for m in pattern.find_iter(text) {
280 add_entity(m, EntityType::Date, 0.93, name); }
282 }
283
284 let time_patterns: &[(&Lazy<Regex>, &'static str)] = &[
286 (&TIME_12H, "TIME_12H"),
287 (&TIME_24H, "TIME_24H"),
288 (&TIME_SIMPLE, "TIME_SIMPLE"),
289 ];
290 for (pattern, name) in time_patterns {
291 for m in pattern.find_iter(text) {
292 add_entity(m, EntityType::Time, 0.90, name);
293 }
294 }
295
296 let money_patterns: &[(&Lazy<Regex>, &'static str)] = &[
298 (&MONEY_SYMBOL, "MONEY_SYMBOL"),
299 (&MONEY_WRITTEN, "MONEY_WRITTEN"),
300 (&MONEY_MAGNITUDE, "MONEY_MAGNITUDE"),
301 ];
302 for (pattern, name) in money_patterns {
303 for m in pattern.find_iter(text) {
304 add_entity(m, EntityType::Money, 0.95, name);
305 }
306 }
307
308 for m in PERCENT.find_iter(text) {
310 add_entity(m, EntityType::Percent, 0.95, "PERCENT");
311 }
312
313 for m in EMAIL.find_iter(text) {
315 add_entity(m, EntityType::Email, 0.98, "EMAIL");
316 }
317
318 for m in URL.find_iter(text) {
320 add_entity(m, EntityType::Url, 0.98, "URL");
321 }
322
323 let phone_patterns: &[(&Lazy<Regex>, &'static str)] =
325 &[(&PHONE_US, "PHONE_US"), (&PHONE_INTL, "PHONE_INTL")];
326 for (pattern, name) in phone_patterns {
327 for m in pattern.find_iter(text) {
328 add_entity(m, EntityType::Phone, 0.85, name);
329 }
330 }
331
332 for m in MENTION.find_iter(text) {
334 let char_start = converter.byte_to_char(m.start());
337 let char_end = converter.byte_to_char(m.end());
338 if !overlaps(&entities, char_start, char_end) {
339 entities.push(Entity::with_provenance(
341 m.as_str(),
342 EntityType::Other("Mention".to_string()),
343 char_start,
344 char_end,
345 0.95,
346 Provenance::pattern("MENTION"),
347 ));
348 }
349 }
350
351 for m in HASHTAG.find_iter(text) {
352 let char_start = converter.byte_to_char(m.start());
353 let char_end = converter.byte_to_char(m.end());
354 if !overlaps(&entities, char_start, char_end) {
355 entities.push(Entity::with_provenance(
356 m.as_str(),
357 EntityType::Other("Hashtag".to_string()),
358 char_start,
359 char_end,
360 0.95,
361 Provenance::pattern("HASHTAG"),
362 ));
363 }
364 }
365
366 entities.sort_unstable_by_key(|e| e.start);
369
370 Ok(entities)
371 }
372
373 fn supported_types(&self) -> Vec<EntityType> {
374 vec![
375 EntityType::Date,
376 EntityType::Time,
377 EntityType::Money,
378 EntityType::Percent,
379 EntityType::Email,
380 EntityType::Url,
381 EntityType::Phone,
382 ]
383 }
384
385 fn is_available(&self) -> bool {
386 true
387 }
388
389 fn name(&self) -> &'static str {
390 "regex"
391 }
392
393 fn description(&self) -> &'static str {
394 "Regex-based NER (dates, times, money, percentages, emails, URLs, phones)"
395 }
396
397 fn capabilities(&self) -> crate::ModelCapabilities {
398 crate::ModelCapabilities {
399 batch_capable: true,
400 streaming_capable: true,
401 ..Default::default()
402 }
403 }
404}
405
406fn overlaps(entities: &[Entity], start: usize, end: usize) -> bool {
408 entities.iter().any(|e| !(end <= e.start || start >= e.end))
409}
410
// Opts `RegexNER` into `StructuredEntityCapable`. The impl body is empty, so
// only whatever the trait itself provides applies.
// NOTE(review): presumably a capability marker for the structured types
// extracted above (email, URL, phone, ...) — confirm against the trait.
impl crate::StructuredEntityCapable for RegexNER {}
413
#[cfg(test)]
mod tests {
    //! Unit tests for `RegexNER`: one group per entity family, followed by
    //! cross-cutting checks (ordering, overlap, offsets, provenance).

    use super::*;

    /// Fresh extractor for a test.
    fn ner() -> RegexNER {
        RegexNER::new()
    }

    /// Runs extraction with no language hint, panicking on error.
    fn extract(text: &str) -> Vec<Entity> {
        let outcome = ner().extract_entities(text, None);
        outcome.expect("NER extraction should succeed")
    }

    /// Whether at least one entity has type `ty`.
    fn has_type(entities: &[Entity], ty: &EntityType) -> bool {
        count_type(entities, ty) > 0
    }

    /// Number of entities with type `ty`.
    fn count_type(entities: &[Entity], ty: &EntityType) -> usize {
        entities
            .iter()
            .filter(|entity| &entity.entity_type == ty)
            .count()
    }

    /// First entity whose text equals `text`, if any.
    fn find_text<'a>(entities: &'a [Entity], text: &str) -> Option<&'a Entity> {
        entities.iter().find(|entity| entity.text == text)
    }

    /// Asserts that each input, extracted on its own, yields an entity of `ty`.
    fn assert_each_yields(cases: &[&str], ty: &EntityType) {
        for &case in cases {
            let found = extract(case);
            assert!(has_type(&found, ty), "Failed: {}", case);
        }
    }

    // ---- dates ----------------------------------------------------------

    #[test]
    fn date_iso_format() {
        let found = extract("Meeting on 2024-01-15.");
        assert!(find_text(&found, "2024-01-15").is_some());
    }

    #[test]
    fn date_us_format() {
        let found = extract("Due by 12/31/2024 and 1/5/24.");
        assert_eq!(count_type(&found, &EntityType::Date), 2);
    }

    #[test]
    fn date_eu_format() {
        let found = extract("Released on 31.12.2024.");
        assert!(find_text(&found, "31.12.2024").is_some());
    }

    #[test]
    fn date_written_full() {
        assert_each_yields(
            &[
                "January 15, 2024",
                "February 28",
                "March 1st, 2024",
                "December 25th",
            ],
            &EntityType::Date,
        );
    }

    #[test]
    fn date_written_short() {
        assert_each_yields(
            &["Jan 15, 2024", "Feb 28", "Mar. 1st", "Dec 25th, 2024"],
            &EntityType::Date,
        );
    }

    #[test]
    fn date_eu_written() {
        assert_each_yields(
            &["15 January 2024", "28th February", "1st March 2024"],
            &EntityType::Date,
        );
    }

    // ---- times ----------------------------------------------------------

    #[test]
    fn time_12h_format() {
        assert_each_yields(
            &["3:30 PM", "10:00 am", "12:30:45 p.m.", "9:00 AM"],
            &EntityType::Time,
        );
    }

    #[test]
    fn time_24h_format() {
        assert_each_yields(&["14:30", "09:00", "23:59:59", "0:00"], &EntityType::Time);
    }

    #[test]
    fn time_simple() {
        assert_each_yields(&["3pm", "10 AM", "9 a.m."], &EntityType::Time);
    }

    // ---- money ----------------------------------------------------------

    #[test]
    fn money_dollar_basic() {
        assert_each_yields(
            &["$100", "$1,000", "$99.99", "$1,234,567.89"],
            &EntityType::Money,
        );
    }

    #[test]
    fn money_with_magnitude() {
        assert_each_yields(
            &["$5 million", "$1.5B", "$100K", "$2 billion"],
            &EntityType::Money,
        );
    }

    #[test]
    fn money_other_currencies() {
        assert_each_yields(&["€500", "£100", "¥1000"], &EntityType::Money);
    }

    /// `€` is multi-byte in UTF-8, so byte and character offsets diverge after
    /// the first symbol; the reported spans must be character offsets.
    #[test]
    fn money_unicode_offsets_correct() {
        let entities = extract("Price: €50 then €100");

        let money: Vec<_> = entities
            .iter()
            .filter(|entity| entity.entity_type == EntityType::Money)
            .collect();

        assert_eq!(money.len(), 2, "Expected 2 money entities, got {:?}", money);

        let (first, second) = (money[0], money[1]);
        assert_eq!(first.start, 7, "First € should be at char 7, not byte 7");
        assert_eq!(first.end, 10, "First entity end should be char 10");
        assert_eq!(
            second.start, 16,
            "Second € should be at char 16, not byte 18"
        );
        assert_eq!(second.end, 20, "Second entity end should be char 20");
    }

    #[test]
    fn money_written() {
        assert_each_yields(
            &[
                "50 dollars",
                "100 USD",
                "500 euros",
                "1000 EUR",
                "200 pounds",
            ],
            &EntityType::Money,
        );
    }

    #[test]
    fn money_magnitude_written() {
        assert_each_yields(
            &["5 billion dollars", "1.5 million euros", "100 million"],
            &EntityType::Money,
        );
    }

    // ---- percent --------------------------------------------------------

    #[test]
    fn percent_basic() {
        assert_each_yields(&["15%", "3.5%", "100%", "0.01%"], &EntityType::Percent);
    }

    #[test]
    fn percent_written() {
        assert_each_yields(&["15 percent", "50 pct"], &EntityType::Percent);
    }

    // ---- email / url / phone --------------------------------------------

    #[test]
    fn email_basic() {
        assert_each_yields(
            &[
                "user@example.com",
                "john.doe@company.org",
                "support+ticket@help.co.uk",
                "test_123@sub.domain.io",
            ],
            &EntityType::Email,
        );
    }

    #[test]
    fn url_basic() {
        assert_each_yields(
            &[
                "https://example.com",
                "http://www.google.com",
                "https://sub.domain.co.uk/path?query=1",
                "http://localhost:8080/api",
            ],
            &EntityType::Url,
        );
    }

    #[test]
    fn phone_us_format() {
        assert_each_yields(
            &[
                "(555) 123-4567",
                "555-123-4567",
                "555.123.4567",
                "1-555-123-4567",
                "+1 555 123 4567",
            ],
            &EntityType::Phone,
        );
    }

    #[test]
    fn phone_international() {
        assert_each_yields(
            &["+44 20 7946 0958", "+81 3 1234 5678"],
            &EntityType::Phone,
        );
    }

    // ---- cross-cutting behaviour ----------------------------------------

    #[test]
    fn mixed_entities() {
        let text = "Meeting on Jan 15 at 3:30 PM. Cost: $500. Contact: bob@acme.com or (555) 123-4567. Completion: 75%.";
        let found = extract(text);

        assert!(
            has_type(&found, &EntityType::Date),
            "Should have Date: {:?}",
            found
        );
        assert!(
            has_type(&found, &EntityType::Time),
            "Should have Time: {:?}",
            found
        );
        assert!(
            has_type(&found, &EntityType::Money),
            "Should have Money: {:?}",
            found
        );
        assert!(
            has_type(&found, &EntityType::Percent),
            "Should have Percent: {:?}",
            found
        );
        assert!(
            has_type(&found, &EntityType::Email),
            "Should have Email: {:?}",
            found
        );
        assert!(
            has_type(&found, &EntityType::Phone),
            "Should have Phone: {:?}",
            found
        );
    }

    #[test]
    fn no_person_org_loc() {
        let found = extract("John Smith works at Google in New York.");
        for absent in [
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
        ] {
            assert!(!has_type(&found, &absent));
        }
    }

    #[test]
    fn entities_sorted_by_position() {
        let found = extract("$100 on 2024-01-01 at 50%");
        let positions: Vec<usize> = found.iter().map(|entity| entity.start).collect();
        let ordered = {
            let mut copy = positions.clone();
            copy.sort();
            copy
        };
        assert_eq!(positions, ordered);
    }

    #[test]
    fn no_overlapping_entities() {
        let found = extract("The price is $1,000,000 (1 million dollars).");
        for (idx, left) in found.iter().enumerate() {
            for right in &found[idx + 1..] {
                let overlap = left.start < right.end && right.start < left.end;
                assert!(!overlap, "Overlap: {:?} and {:?}", left, right);
            }
        }
    }

    #[test]
    fn empty_text() {
        assert!(extract("").is_empty());
    }

    #[test]
    fn no_entities_text() {
        assert!(extract("The quick brown fox jumps over the lazy dog.").is_empty());
    }

    #[test]
    fn entity_spans_correct() {
        use crate::offset::TextSpan;

        let text = "Cost: $100";
        let found = extract(text);
        let money = find_text(&found, "$100").expect("money entity should be found");
        let span = TextSpan::from_chars(text, money.start, money.end);
        assert_eq!(span.extract(text), "$100");
    }

    #[test]
    fn provenance_attached() {
        use anno_core::ExtractionMethod;

        let found = extract("Contact: test@email.com on 2024-01-15");

        // Every entity carries pattern provenance with a pattern name.
        for entity in &found {
            assert!(
                entity.provenance.is_some(),
                "Missing provenance for {:?}",
                entity
            );
            let prov = entity
                .provenance
                .as_ref()
                .expect("provenance should be set");

            assert_eq!(prov.source.as_ref(), "pattern");
            assert_eq!(prov.method, ExtractionMethod::Pattern);

            assert!(
                prov.pattern.is_some(),
                "Missing pattern name for {:?}",
                entity
            );
        }

        // The recorded pattern name identifies the regex that matched.
        let email = find_text(&found, "test@email.com").expect("email entity should be found");
        let email_prov = email
            .provenance
            .as_ref()
            .expect("provenance should be set");
        let email_pattern = email_prov
            .pattern
            .as_ref()
            .expect("pattern should be set");
        assert_eq!(email_pattern.as_ref(), "EMAIL");

        let date = find_text(&found, "2024-01-15").expect("date entity should be found");
        let date_prov = date.provenance.as_ref().expect("provenance should be set");
        let date_pattern = date_prov
            .pattern
            .as_ref()
            .expect("pattern should be set");
        assert_eq!(date_pattern.as_ref(), "DATE_ISO");
    }

    // ---- dates in other languages ---------------------------------------

    #[test]
    fn japanese_date_format() {
        for case in ["2024年1月15日", "2024年12月31日", "2000年01月01日"] {
            let found = extract(case);
            assert!(has_type(&found, &EntityType::Date), "Failed: {}", case);
            assert_eq!(found[0].text, case);
        }
    }

    #[test]
    fn korean_date_format() {
        assert_each_yields(&["2024년 1월 15일", "2024년 12월 31일"], &EntityType::Date);
    }

    #[test]
    fn german_month_names() {
        let cases = [
            ("15. Januar 2024", "15. Januar 2024"),
            ("3 März 2023", "3 März 2023"),
            ("25 Dezember", "25 Dezember"),
        ];
        for (text, expected) in cases {
            let found = extract(text);
            assert!(has_type(&found, &EntityType::Date), "Failed: {}", text);
            assert!(
                find_text(&found, expected).is_some(),
                "Expected '{}' in: {}",
                expected,
                text
            );
        }
    }

    #[test]
    fn french_month_names() {
        assert_each_yields(
            &["15 janvier 2024", "1er février 2023", "25 décembre"],
            &EntityType::Date,
        );
    }

    #[test]
    fn spanish_month_names() {
        assert_each_yields(
            &["15 de enero de 2024", "5 marzo 2023", "25 diciembre"],
            &EntityType::Date,
        );
    }

    #[test]
    fn italian_month_names() {
        assert!(has_type(&extract("15 gennaio 2024"), &EntityType::Date));
    }

    #[test]
    fn portuguese_month_names() {
        assert!(has_type(
            &extract("15 de janeiro de 2024"),
            &EntityType::Date
        ));
    }

    #[test]
    fn dutch_month_names() {
        assert!(has_type(&extract("15 januari 2024"), &EntityType::Date));
    }

    #[test]
    fn russian_month_names() {
        assert!(has_type(&extract("15 января 2024"), &EntityType::Date));
    }

    #[test]
    fn multilingual_dates_with_context() {
        let found = extract("Meeting on 2024年1月15日 at the office. Follow-up on 15 janvier.");
        let dates: Vec<_> = found
            .iter()
            .filter(|entity| entity.entity_type == EntityType::Date)
            .collect();
        assert_eq!(dates.len(), 2, "Expected 2 dates, got {:?}", dates);
    }
}
955
956impl crate::BatchCapable for RegexNER {
961 fn extract_entities_batch(
962 &self,
963 texts: &[&str],
964 language: Option<&str>,
965 ) -> Result<Vec<Vec<Entity>>> {
966 texts
967 .iter()
968 .map(|text| self.extract_entities(text, language))
969 .collect()
970 }
971
972 fn optimal_batch_size(&self) -> Option<usize> {
973 Some(64) }
975}
976
977impl crate::StreamingCapable for RegexNER {
978 fn recommended_chunk_size(&self) -> usize {
979 10_000 }
981}
982
// Further tests kept in a separate `proptests` module file, compiled only for
// test builds.
// NOTE(review): presumably property-based tests, judging by the name — confirm.
#[cfg(test)]
mod proptests;