1use anno_core::EntityType;
7use once_cell::sync::Lazy;
8use regex::Regex;
9
10pub struct PatternDef {
12 pub regex: &'static Lazy<Regex>,
14 pub entity_type: EntityType,
16 pub confidence: f64,
18 pub name: &'static str,
20}
21
22pub static PATTERNS: Lazy<Vec<PatternDef>> = Lazy::new(|| {
27 vec![
28 PatternDef {
34 regex: &EMAIL,
35 entity_type: EntityType::Email,
36 confidence: 0.98,
37 name: "EMAIL",
38 },
39 PatternDef {
41 regex: &URL,
42 entity_type: EntityType::Url,
43 confidence: 0.98,
44 name: "URL",
45 },
46 PatternDef {
48 regex: &DATE_ISO,
49 entity_type: EntityType::Date,
50 confidence: 0.98,
51 name: "DATE_ISO",
52 },
53 PatternDef {
59 regex: &MONEY_SYMBOL,
60 entity_type: EntityType::Money,
61 confidence: 0.95,
62 name: "MONEY_SYMBOL",
63 },
64 PatternDef {
65 regex: &MONEY_WRITTEN,
66 entity_type: EntityType::Money,
67 confidence: 0.95,
68 name: "MONEY_WRITTEN",
69 },
70 PatternDef {
71 regex: &MONEY_MAGNITUDE,
72 entity_type: EntityType::Money,
73 confidence: 0.92,
74 name: "MONEY_MAGNITUDE",
75 },
76 PatternDef {
78 regex: &DATE_US,
79 entity_type: EntityType::Date,
80 confidence: 0.95,
81 name: "DATE_US",
82 },
83 PatternDef {
84 regex: &DATE_EU,
85 entity_type: EntityType::Date,
86 confidence: 0.95,
87 name: "DATE_EU",
88 },
89 PatternDef {
90 regex: &DATE_WRITTEN_FULL,
91 entity_type: EntityType::Date,
92 confidence: 0.95,
93 name: "DATE_WRITTEN_FULL",
94 },
95 PatternDef {
96 regex: &DATE_WRITTEN_SHORT,
97 entity_type: EntityType::Date,
98 confidence: 0.95,
99 name: "DATE_WRITTEN_SHORT",
100 },
101 PatternDef {
102 regex: &DATE_WRITTEN_EU,
103 entity_type: EntityType::Date,
104 confidence: 0.95,
105 name: "DATE_WRITTEN_EU",
106 },
107 PatternDef {
109 regex: &PERCENT,
110 entity_type: EntityType::Percent,
111 confidence: 0.95,
112 name: "PERCENT",
113 },
114 PatternDef {
120 regex: &TIME_12H,
121 entity_type: EntityType::Time,
122 confidence: 0.90,
123 name: "TIME_12H",
124 },
125 PatternDef {
126 regex: &TIME_24H,
127 entity_type: EntityType::Time,
128 confidence: 0.88,
129 name: "TIME_24H",
130 },
131 PatternDef {
132 regex: &TIME_SIMPLE,
133 entity_type: EntityType::Time,
134 confidence: 0.85,
135 name: "TIME_SIMPLE",
136 },
137 PatternDef {
139 regex: &PHONE_US,
140 entity_type: EntityType::Phone,
141 confidence: 0.85,
142 name: "PHONE_US",
143 },
144 PatternDef {
145 regex: &PHONE_INTL,
146 entity_type: EntityType::Phone,
147 confidence: 0.85,
148 name: "PHONE_INTL",
149 },
150 ]
151});
152
153static DATE_ISO: Lazy<Regex> =
161 Lazy::new(|| Regex::new(r"\b\d{4}-\d{2}-\d{2}\b").expect("DATE_ISO regex is invalid"));
162static DATE_US: Lazy<Regex> =
163 Lazy::new(|| Regex::new(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b").expect("DATE_US regex is invalid"));
164static DATE_EU: Lazy<Regex> =
165 Lazy::new(|| Regex::new(r"\b\d{1,2}\.\d{1,2}\.\d{2,4}\b").expect("DATE_EU regex is invalid"));
166static DATE_WRITTEN_FULL: Lazy<Regex> = Lazy::new(|| {
167 Regex::new(r"(?i)\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b")
168 .expect("DATE_WRITTEN_FULL regex is invalid")
169});
170static DATE_WRITTEN_SHORT: Lazy<Regex> = Lazy::new(|| {
171 Regex::new(r"(?i)\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\s+\d{1,2}(?:st|nd|rd|th)?(?:,?\s*\d{4})?\b")
172 .expect("DATE_WRITTEN_SHORT regex is invalid")
173});
174static DATE_WRITTEN_EU: Lazy<Regex> = Lazy::new(|| {
175 Regex::new(r"(?i)\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?(?:\s+\d{4})?\b")
176 .expect("DATE_WRITTEN_EU regex is invalid")
177});
178
179static TIME_12H: Lazy<Regex> = Lazy::new(|| {
181 Regex::new(r"(?i)\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:am|pm|a\.m\.|p\.m\.)\b")
182 .expect("TIME_12H regex is invalid")
183});
184static TIME_24H: Lazy<Regex> = Lazy::new(|| {
185 Regex::new(r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?\b").expect("TIME_24H regex is invalid")
186});
187static TIME_SIMPLE: Lazy<Regex> = Lazy::new(|| {
188 Regex::new(r"(?i)\b\d{1,2}\s*(?:am\b|pm\b|a\.m\.|p\.m\.)")
189 .expect("TIME_SIMPLE regex is invalid")
190});
191
192static MONEY_SYMBOL: Lazy<Regex> = Lazy::new(|| {
194 Regex::new(r"[$€£¥][\d,]+(?:\.\d{1,2})?(?:\s*(?:billion|million|thousand|B|M|K|bn|mn))?")
195 .expect("MONEY_SYMBOL regex is invalid")
196});
197static MONEY_WRITTEN: Lazy<Regex> = Lazy::new(|| {
198 Regex::new(
199 r"(?i)\b\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:dollars?|USD|euros?|EUR|pounds?|GBP|yen|JPY)\b",
200 )
201 .expect("MONEY_WRITTEN regex is invalid")
202});
203static MONEY_MAGNITUDE: Lazy<Regex> = Lazy::new(|| {
204 Regex::new(
205 r"(?i)\b\d+(?:\.\d+)?\s*(?:billion|million|trillion)\s*(?:dollars?|euros?|pounds?)?\b",
206 )
207 .expect("MONEY_MAGNITUDE regex is invalid")
208});
209
210static PERCENT: Lazy<Regex> = Lazy::new(|| {
212 Regex::new(r"\b\d+(?:\.\d+)?\s*(?:%|percent\b|pct\b)").expect("PERCENT regex is invalid")
213});
214
215static EMAIL: Lazy<Regex> = Lazy::new(|| {
217 Regex::new(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")
218 .expect("EMAIL regex is invalid")
219});
220static URL: Lazy<Regex> = Lazy::new(|| {
221 Regex::new(r"(?i)\bhttps?://[^\s<>\[\]{}|\\^`\x00-\x1f]+").expect("URL regex is invalid")
222});
223static PHONE_US: Lazy<Regex> = Lazy::new(|| {
224 Regex::new(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b")
225 .expect("PHONE_US regex is invalid")
226});
227static PHONE_INTL: Lazy<Regex> = Lazy::new(|| {
228 Regex::new(r"\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b")
229 .expect("PHONE_INTL regex is invalid")
230});
231
232pub fn supported_types() -> Vec<EntityType> {
234 vec![
235 EntityType::Date,
236 EntityType::Time,
237 EntityType::Money,
238 EntityType::Percent,
239 EntityType::Email,
240 EntityType::Url,
241 EntityType::Phone,
242 ]
243}
244
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// Every registered regex must actually compile.
    ///
    /// Building `PATTERNS` only stores references to the `Lazy` cells, so a
    /// non-empty table proves nothing on its own; each regex has to be
    /// dereferenced to run its `Regex::new(..).expect(..)` initializer.
    #[test]
    fn test_patterns_compile() {
        assert!(!PATTERNS.is_empty());
        for def in PATTERNS.iter() {
            assert!(
                !def.regex.as_str().is_empty(),
                "{} has an empty pattern",
                def.name
            );
        }
    }

    /// Pattern names are unique and confidences are valid probabilities.
    #[test]
    fn test_pattern_metadata() {
        let mut seen = HashSet::new();
        for def in PATTERNS.iter() {
            assert!(seen.insert(def.name), "duplicate pattern name {}", def.name);
            assert!(
                (0.0..=1.0).contains(&def.confidence),
                "{} has out-of-range confidence {}",
                def.name,
                def.confidence
            );
        }
    }

    #[test]
    fn test_email_pattern() {
        assert!(EMAIL.is_match("test@example.com"));
        assert!(!EMAIL.is_match("not an email"));
    }

    #[test]
    fn test_url_pattern() {
        assert!(URL.is_match("https://example.com"));
        assert!(URL.is_match("http://sub.domain.org/path?query=1"));
        assert!(!URL.is_match("not a url"));
    }

    #[test]
    fn test_money_patterns() {
        assert!(MONEY_SYMBOL.is_match("$100"));
        assert!(MONEY_SYMBOL.is_match("€50.00"));
        assert!(MONEY_WRITTEN.is_match("100 dollars"));
        assert!(MONEY_MAGNITUDE.is_match("5 million dollars"));
    }

    #[test]
    fn test_date_patterns() {
        assert!(DATE_ISO.is_match("2024-01-15"));
        assert!(DATE_US.is_match("1/15/2024"));
        assert!(DATE_EU.is_match("15.01.2024"));
        assert!(DATE_WRITTEN_FULL.is_match("January 15, 2024"));
        assert!(DATE_WRITTEN_SHORT.is_match("Sept. 3rd"));
        assert!(DATE_WRITTEN_EU.is_match("15 January 2024"));
        assert!(!DATE_ISO.is_match("not a date"));
    }

    #[test]
    fn test_time_patterns() {
        assert!(TIME_12H.is_match("3:30 PM"));
        assert!(TIME_24H.is_match("14:30"));
        assert!(!TIME_24H.is_match("25:30"));
        assert!(TIME_SIMPLE.is_match("3pm"));
        assert!(TIME_SIMPLE.is_match("7 a.m."));
    }

    #[test]
    fn test_percent_pattern() {
        assert!(PERCENT.is_match("15%"));
        assert!(PERCENT.is_match("3.5 percent"));
        assert!(!PERCENT.is_match("percent"));
    }

    #[test]
    fn test_phone_patterns() {
        assert!(PHONE_US.is_match("(555) 123-4567"));
        assert!(PHONE_INTL.is_match("+44 20 7946 0958"));
        assert!(!PHONE_US.is_match("call me"));
    }
}