Skip to main content

scirs2_text/information_extraction/
patterns.rs

1//! Regex patterns for information extraction
2//!
3//! This module contains predefined regex patterns for extracting
4//! common entities and information from text.
5
6#![allow(missing_docs)]
7
8use lazy_static::lazy_static;
9use regex::Regex;
10
11lazy_static! {
12    // Common regex patterns for information extraction
13    pub static ref EMAIL_PATTERN: Regex = Regex::new(
14        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
15    ).expect("Operation failed");
16
17    pub static ref URL_PATTERN: Regex = Regex::new(
18        r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)"
19    ).expect("Operation failed");
20
21    pub static ref PHONE_PATTERN: Regex = Regex::new(
22        r"(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})"
23    ).expect("Operation failed");
24
25    pub static ref DATE_PATTERN: Regex = Regex::new(
26        r"\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01])[/-](?:19|20)?\d{2})|(?:(?:19|20)\d{2}[/-](?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12][0-9]|3[01]))|(?:(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})|(?:\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})\b"
27    ).expect("Operation failed");
28
29    pub static ref TIME_PATTERN: Regex = Regex::new(
30        r"\b(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?(?:\s*[aApP][mM])?\b"
31    ).expect("Operation failed");
32
33    pub static ref MONEY_PATTERN: Regex = Regex::new(
34        r"[$€£¥]\s*\d+(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:,\d{3})*(?:\.\d{1,2})?\s*(?:dollars?|euros?|pounds?|yen)"
35    ).expect("Operation failed");
36
37    pub static ref PERCENTAGE_PATTERN: Regex = Regex::new(
38        r"\b\d+(?:\.\d+)?%\b"
39    ).expect("Operation failed");
40}