1use crate::{Entity, EntityType, Model, Result};
17use once_cell::sync::Lazy;
18use regex::Regex;
19
/// Stateless named-entity recognizer backed by the regular expressions
/// declared in this file.
///
/// The type is a unit struct: it holds no data, so copies are free and every
/// value is interchangeable with every other.
#[derive(Debug, Clone, Copy)]
pub struct RegexNER;
51
52impl RegexNER {
53 #[must_use]
55 pub fn new() -> Self {
56 Self
57 }
58}
59
60impl Default for RegexNER {
61 fn default() -> Self {
62 Self::new()
63 }
64}
65
66static DATE_ISO: Lazy<Regex> =
68 Lazy::new(|| Regex::new(r"\b\d{4}-\d{2}-\d{2}\b").expect("valid regex"));
69
70static DATE_US: Lazy<Regex> =
71 Lazy::new(|| Regex::new(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b").expect("valid regex"));
72
73static DATE_EU: Lazy<Regex> =
74 Lazy::new(|| Regex::new(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b").expect("valid regex"));
75
76static DATE_WRITTEN_FULL: Lazy<Regex> = Lazy::new(|| {
77 Regex::new(r"(?i)\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b").expect("valid regex")
78});
79
80static DATE_WRITTEN_SHORT: Lazy<Regex> = Lazy::new(|| {
81 Regex::new(r"(?i)\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b").expect("valid regex")
82});
83
84static DATE_WRITTEN_EU: Lazy<Regex> = Lazy::new(|| {
85 Regex::new(r"(?i)\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?(?:\s+\d{4})?\b").expect("valid regex")
86});
87
88static DATE_JAPANESE: Lazy<Regex> = Lazy::new(|| {
93 Regex::new(r"\d{4}年\d{1,2}月\d{1,2}日").expect("valid regex")
95});
96
97static DATE_GERMAN_FULL: Lazy<Regex> = Lazy::new(|| {
103 Regex::new(r"(?i)\b(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\s+\d{1,2}(?:\.)?(?:,?\s*\d{4})?\b").expect("valid regex")
104});
105
106static DATE_GERMAN_EU: Lazy<Regex> = Lazy::new(|| {
107 Regex::new(r"(?i)\b\d{1,2}\.?\s+(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)(?:\s+\d{4})?\b").expect("valid regex")
109});
110
111static DATE_FRENCH_FULL: Lazy<Regex> = Lazy::new(|| {
113 Regex::new(r"(?i)\b(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)\s+\d{1,2}(?:,?\s*\d{4})?\b").expect("valid regex")
114});
115
116static DATE_FRENCH_EU: Lazy<Regex> = Lazy::new(|| {
117 Regex::new(r"(?i)\b\d{1,2}(?:er)?\s+(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)(?:\s+\d{4})?\b").expect("valid regex")
119});
120
121static DATE_SPANISH_EU: Lazy<Regex> = Lazy::new(|| {
123 Regex::new(r"(?i)\b\d{1,2}\s+(?:de\s+)?(?:enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre)(?:\s+(?:de\s+)?\d{4})?\b").expect("valid regex")
125});
126
127static DATE_ITALIAN_EU: Lazy<Regex> = Lazy::new(|| {
129 Regex::new(r"(?i)\b\d{1,2}\s+(?:gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre)(?:\s+\d{4})?\b").expect("valid regex")
130});
131
132static DATE_PORTUGUESE_EU: Lazy<Regex> = Lazy::new(|| {
134 Regex::new(r"(?i)\b\d{1,2}\s+(?:de\s+)?(?:janeiro|fevereiro|março|marco|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)(?:\s+(?:de\s+)?\d{4})?\b").expect("valid regex")
136});
137
138static DATE_DUTCH_EU: Lazy<Regex> = Lazy::new(|| {
140 Regex::new(r"(?i)\b\d{1,2}\s+(?:januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december)(?:\s+\d{4})?\b").expect("valid regex")
141});
142
143static DATE_RUSSIAN_EU: Lazy<Regex> = Lazy::new(|| {
145 Regex::new(r"\b\d{1,2}\s+(?:января|февраля|марта|апреля|мая|июня|июля|августа|сентября|октября|ноября|декабря)(?:\s+\d{4})?\b").expect("valid regex")
147});
148
149static DATE_KOREAN: Lazy<Regex> =
154 Lazy::new(|| Regex::new(r"\d{4}년\s*\d{1,2}월\s*\d{1,2}일").expect("valid regex"));
155
156static TIME_12H: Lazy<Regex> = Lazy::new(|| {
157 Regex::new(r"(?i)\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:am|pm|a\.m\.|p\.m\.)\b").expect("valid regex")
158});
159
160static TIME_24H: Lazy<Regex> =
161 Lazy::new(|| Regex::new(r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?\b").expect("valid regex"));
162
163static TIME_SIMPLE: Lazy<Regex> = Lazy::new(|| {
164 Regex::new(r"(?i)\b\d{1,2}\s*(?:am\b|pm\b|a\.m\.|p\.m\.)").expect("valid regex")
166});
167
168static MONEY_SYMBOL: Lazy<Regex> = Lazy::new(|| {
169 Regex::new(r"[$€£¥][\d,]+(?:\.\d{1,2})?(?:\s*(?:billion|million|thousand|B|M|K|bn|mn))?")
170 .expect("valid regex")
171});
172
173static MONEY_WRITTEN: Lazy<Regex> = Lazy::new(|| {
174 Regex::new(
175 r"(?i)\b\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:dollars?|USD|euros?|EUR|pounds?|GBP|yen|JPY)\b",
176 )
177 .expect("valid regex")
178});
179
180static MONEY_MAGNITUDE: Lazy<Regex> = Lazy::new(|| {
181 Regex::new(
182 r"(?i)\b\d+(?:\.\d+)?\s*(?:billion|million|trillion)\s*(?:dollars?|euros?|pounds?)?\b",
183 )
184 .expect("valid regex")
185});
186
187static PERCENT: Lazy<Regex> = Lazy::new(|| {
188 Regex::new(r"\b\d+(?:\.\d+)?\s*(?:%|percent\b|pct\b)").expect("valid regex")
190});
191
192static EMAIL: Lazy<Regex> = Lazy::new(|| {
193 Regex::new(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b").expect("valid regex")
194});
195
196static URL: Lazy<Regex> =
197 Lazy::new(|| Regex::new(r"(?i)\bhttps?://[^\s<>\[\]{}|\\^`\x00-\x1f]+").expect("valid regex"));
198
199static PHONE_US: Lazy<Regex> = Lazy::new(|| {
200 Regex::new(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b").expect("valid regex")
201});
202
203static PHONE_INTL: Lazy<Regex> = Lazy::new(|| {
204 Regex::new(r"\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b").expect("valid regex")
205});
206
207static MENTION: Lazy<Regex> = Lazy::new(|| {
208 Regex::new(r"\B@[\w](?:[\w.]*[\w])?").expect("valid regex")
210});
211
212static HASHTAG: Lazy<Regex> = Lazy::new(|| {
213 Regex::new(r"\B#\w+").expect("valid regex")
215});
216
217impl Model for RegexNER {
218 fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
219 use crate::offset::SpanConverter;
220 use anno_core::Provenance;
221 let mut entities = Vec::new();
222
223 let converter = SpanConverter::new(text);
226
227 let mut add_entity =
231 |m: regex::Match, entity_type: EntityType, confidence: f64, pattern: &'static str| {
232 let char_start = converter.byte_to_char(m.start());
235 let char_end = converter.byte_to_char(m.end());
236 if !overlaps(&entities, char_start, char_end) {
237 entities.push(Entity::with_provenance(
238 m.as_str(),
239 entity_type,
240 char_start,
241 char_end,
242 confidence,
243 Provenance::pattern(pattern),
244 ));
245 }
246 };
247
248 let date_patterns_en: &[(&Lazy<Regex>, &'static str)] = &[
251 (&DATE_ISO, "DATE_ISO"),
252 (&DATE_US, "DATE_US"),
253 (&DATE_EU, "DATE_EU"),
254 (&DATE_WRITTEN_FULL, "DATE_WRITTEN_FULL"),
255 (&DATE_WRITTEN_SHORT, "DATE_WRITTEN_SHORT"),
256 (&DATE_WRITTEN_EU, "DATE_WRITTEN_EU"),
257 ];
258 for (pattern, name) in date_patterns_en {
259 for m in pattern.find_iter(text) {
260 add_entity(m, EntityType::Date, 0.95, name);
261 }
262 }
263
264 let date_patterns_i18n: &[(&Lazy<Regex>, &'static str)] = &[
266 (&DATE_JAPANESE, "DATE_JAPANESE"),
267 (&DATE_KOREAN, "DATE_KOREAN"),
268 (&DATE_GERMAN_FULL, "DATE_GERMAN_FULL"),
269 (&DATE_GERMAN_EU, "DATE_GERMAN_EU"),
270 (&DATE_FRENCH_FULL, "DATE_FRENCH_FULL"),
271 (&DATE_FRENCH_EU, "DATE_FRENCH_EU"),
272 (&DATE_SPANISH_EU, "DATE_SPANISH_EU"),
273 (&DATE_ITALIAN_EU, "DATE_ITALIAN_EU"),
274 (&DATE_PORTUGUESE_EU, "DATE_PORTUGUESE_EU"),
275 (&DATE_DUTCH_EU, "DATE_DUTCH_EU"),
276 (&DATE_RUSSIAN_EU, "DATE_RUSSIAN_EU"),
277 ];
278 for (pattern, name) in date_patterns_i18n {
279 for m in pattern.find_iter(text) {
280 add_entity(m, EntityType::Date, 0.93, name); }
282 }
283
284 let time_patterns: &[(&Lazy<Regex>, &'static str)] = &[
286 (&TIME_12H, "TIME_12H"),
287 (&TIME_24H, "TIME_24H"),
288 (&TIME_SIMPLE, "TIME_SIMPLE"),
289 ];
290 for (pattern, name) in time_patterns {
291 for m in pattern.find_iter(text) {
292 add_entity(m, EntityType::Time, 0.90, name);
293 }
294 }
295
296 let money_patterns: &[(&Lazy<Regex>, &'static str)] = &[
298 (&MONEY_SYMBOL, "MONEY_SYMBOL"),
299 (&MONEY_WRITTEN, "MONEY_WRITTEN"),
300 (&MONEY_MAGNITUDE, "MONEY_MAGNITUDE"),
301 ];
302 for (pattern, name) in money_patterns {
303 for m in pattern.find_iter(text) {
304 add_entity(m, EntityType::Money, 0.95, name);
305 }
306 }
307
308 for m in PERCENT.find_iter(text) {
310 add_entity(m, EntityType::Percent, 0.95, "PERCENT");
311 }
312
313 for m in EMAIL.find_iter(text) {
315 add_entity(m, EntityType::Email, 0.98, "EMAIL");
316 }
317
318 for m in URL.find_iter(text) {
320 add_entity(m, EntityType::Url, 0.98, "URL");
321 }
322
323 let phone_patterns: &[(&Lazy<Regex>, &'static str)] =
325 &[(&PHONE_US, "PHONE_US"), (&PHONE_INTL, "PHONE_INTL")];
326 for (pattern, name) in phone_patterns {
327 for m in pattern.find_iter(text) {
328 add_entity(m, EntityType::Phone, 0.85, name);
329 }
330 }
331
332 for m in MENTION.find_iter(text) {
334 let char_start = converter.byte_to_char(m.start());
337 let char_end = converter.byte_to_char(m.end());
338 if !overlaps(&entities, char_start, char_end) {
339 entities.push(Entity::with_provenance(
341 m.as_str(),
342 EntityType::Other("Mention".to_string()),
343 char_start,
344 char_end,
345 0.95,
346 Provenance::pattern("MENTION"),
347 ));
348 }
349 }
350
351 for m in HASHTAG.find_iter(text) {
352 let char_start = converter.byte_to_char(m.start());
353 let char_end = converter.byte_to_char(m.end());
354 if !overlaps(&entities, char_start, char_end) {
355 entities.push(Entity::with_provenance(
356 m.as_str(),
357 EntityType::Other("Hashtag".to_string()),
358 char_start,
359 char_end,
360 0.95,
361 Provenance::pattern("HASHTAG"),
362 ));
363 }
364 }
365
366 entities.sort_unstable_by_key(|e| e.start);
369
370 Ok(entities)
371 }
372
373 fn supported_types(&self) -> Vec<EntityType> {
374 vec![
375 EntityType::Date,
376 EntityType::Time,
377 EntityType::Money,
378 EntityType::Percent,
379 EntityType::Email,
380 EntityType::Url,
381 EntityType::Phone,
382 ]
383 }
384
385 fn is_available(&self) -> bool {
386 true
387 }
388
389 fn name(&self) -> &'static str {
390 "regex"
391 }
392
393 fn description(&self) -> &'static str {
394 "Regex-based NER (dates, times, money, percentages, emails, URLs, phones)"
395 }
396}
397
398fn overlaps(entities: &[Entity], start: usize, end: usize) -> bool {
400 entities.iter().any(|e| !(end <= e.start || start >= e.end))
401}
402
// Opts `RegexNER` into `StructuredEntityCapable`; the empty body overrides no
// trait items.
impl crate::StructuredEntityCapable for RegexNER {}
405
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a fresh recognizer over `text`, panicking if extraction fails.
    fn extract(text: &str) -> Vec<Entity> {
        RegexNER::new()
            .extract_entities(text, None)
            .expect("NER extraction should succeed")
    }

    /// Number of entities of type `ty`.
    fn count_type(entities: &[Entity], ty: &EntityType) -> usize {
        entities
            .iter()
            .filter(|entity| &entity.entity_type == ty)
            .count()
    }

    /// Whether at least one entity has type `ty`.
    fn has_type(entities: &[Entity], ty: &EntityType) -> bool {
        count_type(entities, ty) > 0
    }

    /// First entity whose surface text equals `text`, if any.
    fn find_text<'a>(entities: &'a [Entity], text: &str) -> Option<&'a Entity> {
        entities.iter().find(|e| e.text == text)
    }

    /// Asserts that each input, extracted on its own, yields an entity of type `ty`.
    fn assert_each_has_type(cases: &[&str], ty: &EntityType) {
        for case in cases {
            let found = extract(case);
            assert!(has_type(&found, ty), "Failed: {}", case);
        }
    }

    // ---- Dates (English / numeric) ----

    #[test]
    fn date_iso_format() {
        let found = extract("Meeting on 2024-01-15.");
        assert!(find_text(&found, "2024-01-15").is_some());
    }

    #[test]
    fn date_us_format() {
        let found = extract("Due by 12/31/2024 and 1/5/24.");
        assert_eq!(count_type(&found, &EntityType::Date), 2);
    }

    #[test]
    fn date_eu_format() {
        let found = extract("Released on 31.12.2024.");
        assert!(find_text(&found, "31.12.2024").is_some());
    }

    #[test]
    fn date_written_full() {
        assert_each_has_type(
            &[
                "January 15, 2024",
                "February 28",
                "March 1st, 2024",
                "December 25th",
            ],
            &EntityType::Date,
        );
    }

    #[test]
    fn date_written_short() {
        assert_each_has_type(
            &["Jan 15, 2024", "Feb 28", "Mar. 1st", "Dec 25th, 2024"],
            &EntityType::Date,
        );
    }

    #[test]
    fn date_eu_written() {
        assert_each_has_type(
            &["15 January 2024", "28th February", "1st March 2024"],
            &EntityType::Date,
        );
    }

    // ---- Times ----

    #[test]
    fn time_12h_format() {
        assert_each_has_type(
            &["3:30 PM", "10:00 am", "12:30:45 p.m.", "9:00 AM"],
            &EntityType::Time,
        );
    }

    #[test]
    fn time_24h_format() {
        assert_each_has_type(&["14:30", "09:00", "23:59:59", "0:00"], &EntityType::Time);
    }

    #[test]
    fn time_simple() {
        assert_each_has_type(&["3pm", "10 AM", "9 a.m."], &EntityType::Time);
    }

    // ---- Money ----

    #[test]
    fn money_dollar_basic() {
        assert_each_has_type(
            &["$100", "$1,000", "$99.99", "$1,234,567.89"],
            &EntityType::Money,
        );
    }

    #[test]
    fn money_with_magnitude() {
        assert_each_has_type(
            &["$5 million", "$1.5B", "$100K", "$2 billion"],
            &EntityType::Money,
        );
    }

    #[test]
    fn money_other_currencies() {
        assert_each_has_type(&["€500", "£100", "¥1000"], &EntityType::Money);
    }

    #[test]
    fn money_unicode_offsets_correct() {
        // `€` is a single char but more than one UTF-8 byte, so byte and char
        // offsets diverge after the first occurrence.
        let found = extract("Price: €50 then €100");

        let money: Vec<&Entity> = found
            .iter()
            .filter(|e| e.entity_type == EntityType::Money)
            .collect();

        assert_eq!(money.len(), 2, "Expected 2 money entities, got {:?}", money);

        let (first, second) = (money[0], money[1]);
        assert_eq!(first.start, 7, "First € should be at char 7, not byte 7");
        assert_eq!(first.end, 10, "First entity end should be char 10");
        assert_eq!(
            second.start, 16,
            "Second € should be at char 16, not byte 18"
        );
        assert_eq!(second.end, 20, "Second entity end should be char 20");
    }

    #[test]
    fn money_written() {
        assert_each_has_type(
            &[
                "50 dollars",
                "100 USD",
                "500 euros",
                "1000 EUR",
                "200 pounds",
            ],
            &EntityType::Money,
        );
    }

    #[test]
    fn money_magnitude_written() {
        assert_each_has_type(
            &["5 billion dollars", "1.5 million euros", "100 million"],
            &EntityType::Money,
        );
    }

    // ---- Percentages ----

    #[test]
    fn percent_basic() {
        assert_each_has_type(&["15%", "3.5%", "100%", "0.01%"], &EntityType::Percent);
    }

    #[test]
    fn percent_written() {
        assert_each_has_type(&["15 percent", "50 pct"], &EntityType::Percent);
    }

    // ---- Emails ----

    #[test]
    fn email_basic() {
        assert_each_has_type(
            &[
                "user@example.com",
                "john.doe@company.org",
                "support+ticket@help.co.uk",
                "test_123@sub.domain.io",
            ],
            &EntityType::Email,
        );
    }

    // ---- URLs ----

    #[test]
    fn url_basic() {
        assert_each_has_type(
            &[
                "https://example.com",
                "http://www.google.com",
                "https://sub.domain.co.uk/path?query=1",
                "http://localhost:8080/api",
            ],
            &EntityType::Url,
        );
    }

    // ---- Phones ----

    #[test]
    fn phone_us_format() {
        assert_each_has_type(
            &[
                "(555) 123-4567",
                "555-123-4567",
                "555.123.4567",
                "1-555-123-4567",
                "+1 555 123 4567",
            ],
            &EntityType::Phone,
        );
    }

    #[test]
    fn phone_international() {
        assert_each_has_type(&["+44 20 7946 0958", "+81 3 1234 5678"], &EntityType::Phone);
    }

    // ---- Whole-pipeline behaviour ----

    #[test]
    fn mixed_entities() {
        let text = "Meeting on Jan 15 at 3:30 PM. Cost: $500. Contact: bob@acme.com or (555) 123-4567. Completion: 75%.";
        let found = extract(text);

        let expected = [
            (EntityType::Date, "Date"),
            (EntityType::Time, "Time"),
            (EntityType::Money, "Money"),
            (EntityType::Percent, "Percent"),
            (EntityType::Email, "Email"),
            (EntityType::Phone, "Phone"),
        ];
        for (ty, label) in expected {
            assert!(has_type(&found, &ty), "Should have {}: {:?}", label, found);
        }
    }

    #[test]
    fn no_person_org_loc() {
        let found = extract("John Smith works at Google in New York.");
        for ty in [
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
        ] {
            assert!(!has_type(&found, &ty));
        }
    }

    #[test]
    fn entities_sorted_by_position() {
        let found = extract("$100 on 2024-01-01 at 50%");
        let starts: Vec<usize> = found.iter().map(|entity| entity.start).collect();
        let mut expected = starts.clone();
        expected.sort();
        assert_eq!(starts, expected);
    }

    #[test]
    fn no_overlapping_entities() {
        let found = extract("The price is $1,000,000 (1 million dollars).");
        for (i, a) in found.iter().enumerate() {
            for b in &found[i + 1..] {
                let overlap = a.start < b.end && b.start < a.end;
                assert!(!overlap, "Overlap: {:?} and {:?}", a, b);
            }
        }
    }

    #[test]
    fn empty_text() {
        assert!(extract("").is_empty());
    }

    #[test]
    fn no_entities_text() {
        assert!(extract("The quick brown fox jumps over the lazy dog.").is_empty());
    }

    #[test]
    fn entity_spans_correct() {
        use crate::offset::TextSpan;

        let text = "Cost: $100";
        let found = extract(text);
        let money = find_text(&found, "$100").expect("money entity should be found");
        let span = TextSpan::from_chars(text, money.start, money.end);
        assert_eq!(span.extract(text), "$100");
    }

    #[test]
    fn provenance_attached() {
        use anno_core::ExtractionMethod;

        let text = "Contact: test@email.com on 2024-01-15";
        let found = extract(text);

        // Every entity must say it came from a named pattern.
        for entity in &found {
            assert!(
                entity.provenance.is_some(),
                "Missing provenance for {:?}",
                entity
            );
            let prov = entity
                .provenance
                .as_ref()
                .expect("provenance should be set");

            assert_eq!(prov.source.as_ref(), "pattern");
            assert_eq!(prov.method, ExtractionMethod::Pattern);

            assert!(
                prov.pattern.is_some(),
                "Missing pattern name for {:?}",
                entity
            );
        }

        // And the recorded name must be the pattern that actually matched.
        let email = find_text(&found, "test@email.com").expect("email entity should be found");
        assert_eq!(
            email
                .provenance
                .as_ref()
                .expect("provenance should be set")
                .pattern
                .as_ref()
                .expect("pattern should be set")
                .as_ref(),
            "EMAIL"
        );

        let date = find_text(&found, "2024-01-15").expect("date entity should be found");
        assert_eq!(
            date.provenance
                .as_ref()
                .expect("provenance should be set")
                .pattern
                .as_ref()
                .expect("pattern should be set")
                .as_ref(),
            "DATE_ISO"
        );
    }

    // ---- Dates (other languages) ----

    #[test]
    fn japanese_date_format() {
        for case in ["2024年1月15日", "2024年12月31日", "2000年01月01日"] {
            let found = extract(case);
            assert!(has_type(&found, &EntityType::Date), "Failed: {}", case);
            assert_eq!(found[0].text, case);
        }
    }

    #[test]
    fn korean_date_format() {
        assert_each_has_type(&["2024년 1월 15일", "2024년 12월 31일"], &EntityType::Date);
    }

    #[test]
    fn german_month_names() {
        let cases = [
            ("15. Januar 2024", "15. Januar 2024"),
            ("3 März 2023", "3 März 2023"),
            ("25 Dezember", "25 Dezember"),
        ];
        for (text, expected) in cases {
            let found = extract(text);
            assert!(has_type(&found, &EntityType::Date), "Failed: {}", text);
            assert!(
                find_text(&found, expected).is_some(),
                "Expected '{}' in: {}",
                expected,
                text
            );
        }
    }

    #[test]
    fn french_month_names() {
        assert_each_has_type(
            &["15 janvier 2024", "1er février 2023", "25 décembre"],
            &EntityType::Date,
        );
    }

    #[test]
    fn spanish_month_names() {
        assert_each_has_type(
            &["15 de enero de 2024", "5 marzo 2023", "25 diciembre"],
            &EntityType::Date,
        );
    }

    #[test]
    fn italian_month_names() {
        assert!(has_type(&extract("15 gennaio 2024"), &EntityType::Date));
    }

    #[test]
    fn portuguese_month_names() {
        assert!(has_type(
            &extract("15 de janeiro de 2024"),
            &EntityType::Date
        ));
    }

    #[test]
    fn dutch_month_names() {
        assert!(has_type(&extract("15 januari 2024"), &EntityType::Date));
    }

    #[test]
    fn russian_month_names() {
        assert!(has_type(&extract("15 января 2024"), &EntityType::Date));
    }

    #[test]
    fn multilingual_dates_with_context() {
        let text = "Meeting on 2024年1月15日 at the office. Follow-up on 15 janvier.";
        let found = extract(text);
        let dates: Vec<_> = found
            .iter()
            .filter(|e| e.entity_type == EntityType::Date)
            .collect();
        assert_eq!(dates.len(), 2, "Expected 2 dates, got {:?}", dates);
    }
}
947
948impl crate::BatchCapable for RegexNER {
953 fn extract_entities_batch(
954 &self,
955 texts: &[&str],
956 language: Option<&str>,
957 ) -> Result<Vec<Vec<Entity>>> {
958 texts
959 .iter()
960 .map(|text| self.extract_entities(text, language))
961 .collect()
962 }
963
964 fn optimal_batch_size(&self) -> Option<usize> {
965 Some(64) }
967}
968
969impl crate::StreamingCapable for RegexNER {
970 fn recommended_chunk_size(&self) -> usize {
971 10_000 }
973}
974
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// Extracts from `text` (panicking on error) and reports whether any
    /// entity has type `ty`.
    fn detects(text: &str, ty: EntityType) -> bool {
        RegexNER::new()
            .extract_entities(text, None)
            .unwrap()
            .iter()
            .any(|entity| entity.entity_type == ty)
    }

    /// Whether two entities' half-open spans share at least one position.
    fn spans_intersect(a: &Entity, b: &Entity) -> bool {
        a.start < b.end && b.start < a.end
    }

    proptest! {
        // Arbitrary input must never make extraction panic; the result itself is ignored.
        #[test]
        fn extraction_never_panics(text in ".*") {
            let _ = RegexNER::new().extract_entities(&text, None);
        }

        // Reported offsets are char offsets inside the text, with start <= end.
        #[test]
        fn entities_within_text_bounds(text in ".{1,200}") {
            if let Ok(found) = RegexNER::new().extract_entities(&text, None) {
                let char_len = text.chars().count();
                for entity in found {
                    prop_assert!(entity.start <= char_len);
                    prop_assert!(entity.end <= char_len);
                    prop_assert!(entity.start <= entity.end);
                }
            }
        }

        #[test]
        fn dollar_amounts_detected(amount in 1u32..10000) {
            let text = format!("Cost: ${}", amount);
            prop_assert!(detects(&text, EntityType::Money));
        }

        #[test]
        fn percentages_detected(pct in 1u32..100) {
            let text = format!("{}% complete", pct);
            prop_assert!(detects(&text, EntityType::Percent));
        }

        #[test]
        fn emails_detected(user in "[a-z]{3,10}", domain in "[a-z]{3,8}") {
            let text = format!("Contact: {}@{}.com", user, domain);
            prop_assert!(detects(&text, EntityType::Email));
        }

        #[test]
        fn urls_detected(path in "[a-z]{1,10}") {
            let text = format!("Visit https://example.com/{}", path);
            prop_assert!(detects(&text, EntityType::Url));
        }

        #[test]
        fn iso_dates_detected(y in 2000u32..2030, m in 1u32..13, d in 1u32..29) {
            let text = format!("Date: {:04}-{:02}-{:02}", y, m, d);
            prop_assert!(detects(&text, EntityType::Date));
        }

        // No two returned entities may cover the same position.
        #[test]
        fn no_overlapping_entities(text in ".{0,100}") {
            if let Ok(found) = RegexNER::new().extract_entities(&text, None) {
                for (i, e1) in found.iter().enumerate() {
                    for e2 in &found[i + 1..] {
                        let overlap = spans_intersect(e1, e2);
                        prop_assert!(!overlap, "Overlap: {:?} and {:?}", e1, e2);
                    }
                }
            }
        }
    }
}