Skip to main content

anno/backends/
pattern_config.rs

1//! Declarative pattern configuration for RegexNER.
2//!
3//! This provides a cleaner, more maintainable way to define patterns
4//! without boilerplate. Patterns are defined once and compiled lazily.
5
6use anno_core::EntityType;
7use once_cell::sync::Lazy;
8use regex::Regex;
9
10/// A pattern definition: regex + entity type + confidence + name.
11pub struct PatternDef {
12    /// The compiled regex pattern.
13    pub regex: &'static Lazy<Regex>,
14    /// The entity type to assign to matches.
15    pub entity_type: EntityType,
16    /// Confidence score (0.0-1.0).
17    pub confidence: f64,
18    /// Pattern name for provenance tracking.
19    pub name: &'static str,
20}
21
22/// All pattern definitions, organized by priority.
23///
24/// Higher priority patterns are checked first. Within same priority,
25/// order in the slice determines precedence.
26pub static PATTERNS: Lazy<Vec<PatternDef>> = Lazy::new(|| {
27    vec![
28        // =====================================================================
29        // High confidence: Very specific formats
30        // =====================================================================
31
32        // Emails (extremely specific)
33        PatternDef {
34            regex: &EMAIL,
35            entity_type: EntityType::Email,
36            confidence: 0.98,
37            name: "EMAIL",
38        },
39        // URLs (extremely specific)
40        PatternDef {
41            regex: &URL,
42            entity_type: EntityType::Url,
43            confidence: 0.98,
44            name: "URL",
45        },
46        // ISO dates (unambiguous)
47        PatternDef {
48            regex: &DATE_ISO,
49            entity_type: EntityType::Date,
50            confidence: 0.98,
51            name: "DATE_ISO",
52        },
53        // =====================================================================
54        // High confidence: Format-based detection
55        // =====================================================================
56
57        // Money with symbols
58        PatternDef {
59            regex: &MONEY_SYMBOL,
60            entity_type: EntityType::Money,
61            confidence: 0.95,
62            name: "MONEY_SYMBOL",
63        },
64        PatternDef {
65            regex: &MONEY_WRITTEN,
66            entity_type: EntityType::Money,
67            confidence: 0.95,
68            name: "MONEY_WRITTEN",
69        },
70        PatternDef {
71            regex: &MONEY_MAGNITUDE,
72            entity_type: EntityType::Money,
73            confidence: 0.92,
74            name: "MONEY_MAGNITUDE",
75        },
76        // Dates (specific formats)
77        PatternDef {
78            regex: &DATE_US,
79            entity_type: EntityType::Date,
80            confidence: 0.95,
81            name: "DATE_US",
82        },
83        PatternDef {
84            regex: &DATE_EU,
85            entity_type: EntityType::Date,
86            confidence: 0.95,
87            name: "DATE_EU",
88        },
89        PatternDef {
90            regex: &DATE_WRITTEN_FULL,
91            entity_type: EntityType::Date,
92            confidence: 0.95,
93            name: "DATE_WRITTEN_FULL",
94        },
95        PatternDef {
96            regex: &DATE_WRITTEN_SHORT,
97            entity_type: EntityType::Date,
98            confidence: 0.95,
99            name: "DATE_WRITTEN_SHORT",
100        },
101        PatternDef {
102            regex: &DATE_WRITTEN_EU,
103            entity_type: EntityType::Date,
104            confidence: 0.95,
105            name: "DATE_WRITTEN_EU",
106        },
107        // Percentages
108        PatternDef {
109            regex: &PERCENT,
110            entity_type: EntityType::Percent,
111            confidence: 0.95,
112            name: "PERCENT",
113        },
114        // =====================================================================
115        // Medium confidence: Times and phones (can have false positives)
116        // =====================================================================
117
118        // Times
119        PatternDef {
120            regex: &TIME_12H,
121            entity_type: EntityType::Time,
122            confidence: 0.90,
123            name: "TIME_12H",
124        },
125        PatternDef {
126            regex: &TIME_24H,
127            entity_type: EntityType::Time,
128            confidence: 0.88,
129            name: "TIME_24H",
130        },
131        PatternDef {
132            regex: &TIME_SIMPLE,
133            entity_type: EntityType::Time,
134            confidence: 0.85,
135            name: "TIME_SIMPLE",
136        },
137        // Phone numbers (many false positives possible)
138        PatternDef {
139            regex: &PHONE_US,
140            entity_type: EntityType::Phone,
141            confidence: 0.85,
142            name: "PHONE_US",
143        },
144        PatternDef {
145            regex: &PHONE_INTL,
146            entity_type: EntityType::Phone,
147            confidence: 0.85,
148            name: "PHONE_INTL",
149        },
150    ]
151});
152
// =============================================================================
// Regex Definitions (compiled once, lazily)
// =============================================================================
// Note: The pattern strings are fixed at compile time, but each regex is only
// compiled on first use. An invalid pattern is a programmer error, so that
// first use panics with a message naming the offending pattern.
159// Date patterns
160static DATE_ISO: Lazy<Regex> =
161    Lazy::new(|| Regex::new(r"\b\d{4}-\d{2}-\d{2}\b").expect("DATE_ISO regex is invalid"));
162static DATE_US: Lazy<Regex> =
163    Lazy::new(|| Regex::new(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b").expect("DATE_US regex is invalid"));
164static DATE_EU: Lazy<Regex> =
165    Lazy::new(|| Regex::new(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b").expect("DATE_EU regex is invalid"));
166static DATE_WRITTEN_FULL: Lazy<Regex> = Lazy::new(|| {
167    Regex::new(r"(?i)\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b")
168        .expect("DATE_WRITTEN_FULL regex is invalid")
169});
170static DATE_WRITTEN_SHORT: Lazy<Regex> = Lazy::new(|| {
171    Regex::new(r"(?i)\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b")
172        .expect("DATE_WRITTEN_SHORT regex is invalid")
173});
174static DATE_WRITTEN_EU: Lazy<Regex> = Lazy::new(|| {
175    Regex::new(r"(?i)\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?(?:\s+\d{4})?\b")
176        .expect("DATE_WRITTEN_EU regex is invalid")
177});
178
179// Time patterns
180static TIME_12H: Lazy<Regex> = Lazy::new(|| {
181    Regex::new(r"(?i)\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:am|pm|a\.m\.|p\.m\.)\b")
182        .expect("TIME_12H regex is invalid")
183});
184static TIME_24H: Lazy<Regex> = Lazy::new(|| {
185    Regex::new(r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?\b").expect("TIME_24H regex is invalid")
186});
187static TIME_SIMPLE: Lazy<Regex> = Lazy::new(|| {
188    Regex::new(r"(?i)\b\d{1,2}\s*(?:am\b|pm\b|a\.m\.|p\.m\.)")
189        .expect("TIME_SIMPLE regex is invalid")
190});
191
192// Money patterns
193static MONEY_SYMBOL: Lazy<Regex> = Lazy::new(|| {
194    Regex::new(r"[$€£¥][\d,]+(?:\.\d{1,2})?(?:\s*(?:billion|million|thousand|B|M|K|bn|mn))?")
195        .expect("MONEY_SYMBOL regex is invalid")
196});
197static MONEY_WRITTEN: Lazy<Regex> = Lazy::new(|| {
198    Regex::new(
199        r"(?i)\b\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:dollars?|USD|euros?|EUR|pounds?|GBP|yen|JPY)\b",
200    )
201    .expect("MONEY_WRITTEN regex is invalid")
202});
203static MONEY_MAGNITUDE: Lazy<Regex> = Lazy::new(|| {
204    Regex::new(
205        r"(?i)\b\d+(?:\.\d+)?\s*(?:billion|million|trillion)\s*(?:dollars?|euros?|pounds?)?\b",
206    )
207    .expect("MONEY_MAGNITUDE regex is invalid")
208});
209
210// Percent pattern
211static PERCENT: Lazy<Regex> = Lazy::new(|| {
212    Regex::new(r"\b\d+(?:\.\d+)?\s*(?:%|percent\b|pct\b)").expect("PERCENT regex is invalid")
213});
214
215// Contact patterns
216static EMAIL: Lazy<Regex> = Lazy::new(|| {
217    Regex::new(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")
218        .expect("EMAIL regex is invalid")
219});
220static URL: Lazy<Regex> = Lazy::new(|| {
221    Regex::new(r"(?i)\bhttps?://[^\s<>\[\]{}|\\^`\x00-\x1f]+").expect("URL regex is invalid")
222});
223static PHONE_US: Lazy<Regex> = Lazy::new(|| {
224    Regex::new(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b")
225        .expect("PHONE_US regex is invalid")
226});
227static PHONE_INTL: Lazy<Regex> = Lazy::new(|| {
228    Regex::new(r"\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b")
229        .expect("PHONE_INTL regex is invalid")
230});
231
232/// Get all supported entity types (for Model trait).
233pub fn supported_types() -> Vec<EntityType> {
234    vec![
235        EntityType::Date,
236        EntityType::Time,
237        EntityType::Money,
238        EntityType::Percent,
239        EntityType::Email,
240        EntityType::Url,
241        EntityType::Phone,
242    ]
243}
244
#[cfg(test)]
mod tests {
    use super::*;

    /// Every regex referenced from the table must compile, and every row must
    /// carry a confidence inside the documented 0.0-1.0 range.
    #[test]
    fn test_patterns_compile() {
        assert!(!PATTERNS.is_empty());

        // Building `PATTERNS` only stores references to the lazy regexes; it
        // does not compile any of them. Force each one explicitly so that an
        // invalid pattern fails here rather than at first use in production.
        for def in PATTERNS.iter() {
            let _ = Lazy::force(def.regex);
            assert!(
                (0.0..=1.0).contains(&def.confidence),
                "{} has a confidence outside 0.0-1.0",
                def.name
            );
        }
    }

    #[test]
    fn test_email_pattern() {
        assert!(EMAIL.is_match("test@example.com"));
        assert!(!EMAIL.is_match("not an email"));
    }

    #[test]
    fn test_url_pattern() {
        assert!(URL.is_match("https://example.com"));
        assert!(URL.is_match("http://sub.domain.org/path?query=1"));
        assert!(!URL.is_match("not a url"));
    }

    #[test]
    fn test_money_patterns() {
        assert!(MONEY_SYMBOL.is_match("$100"));
        assert!(MONEY_SYMBOL.is_match("€50.00"));
        assert!(MONEY_WRITTEN.is_match("100 dollars"));
        assert!(MONEY_MAGNITUDE.is_match("5 million dollars"));
    }
}