commonregex/
lib.rs

1extern crate regex;
2use regex::Regex;
3
4
/// Case-insensitive pattern for written dates: a day (with optional `st`/`nd`/`rd`/`th`)
/// before or after a month name or abbreviation, optionally followed by a comma and a
/// four-digit year, or a numeric `d-m-y` form separated by `-`, `.` or `/`.
pub const DATE: &str = r"(?i)(?:[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+[0-3]?\d(?:st|nd|rd|th)?)(?:,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}";

/// Case-insensitive pattern for clock times such as `5:00PM` or `4:00`, or a single
/// digit directly followed by `am`/`pm`. Note that the optional space after the minutes
/// is part of the match.
pub const TIME: &str = r"(?i)\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?";

/// Pattern for phone numbers: optional country code, optional (possibly parenthesised)
/// three-digit area code, then a 3-digit and a 4-6 digit group; alternatively a two-digit
/// country code followed by whitespace-separated 2/3/4 digit groups.
pub const PHONE: &str = r"(?:(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4,6})|(?:(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4})";

/// Case-insensitive pattern for phone numbers that carry an extension marker
/// (`#`, `x`, `ext` or `extension`), optionally followed by the extension digits.
pub const PHONEWTHEXT: &str = r"(?i)(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?)";
/// Pattern for links: an optional `http://` / `https://` scheme, an optional `www.`,
/// a host-like run of 2-256 characters, a dot, a 2-6 letter lowercase suffix and an
/// optional trailing path/query part.
///
/// The scheme separator is written as plain `://`. A forward slash needs no escaping in
/// this regex syntax, and the earlier `\\/\\/` spelling inside a raw string required a
/// literal backslash before each slash, so the scheme group could never match a real URL.
pub const LINK: &str = r"(http(s)?://.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)";
/// Case-insensitive pattern for e-mail addresses: a local part built from letters,
/// digits and the listed punctuation, an `@`, and one or more dot-separated domain labels.
///
/// The local-part class lists `/` directly. The earlier `\\/` spelling inside a raw
/// string also admitted a literal backslash, which is not an unquoted local-part
/// character.
pub const EMAIL: &str = r"(?i)([A-Za-z0-9!#$%&'*+/=?^_{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)";
/// Pattern for dotted-quad IPv4 addresses; each of the four parts is limited to 0-255.
pub const IPV4: &str = r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)";

/// Pattern for IPv6 addresses, one alternative per number of leading hex groups
/// (7 down to 0), each allowing `::` compression and an embedded dotted-quad tail.
/// An optional `%…` suffix and any trailing whitespace are included in the match.
pub const IPV6: &str = r"(?:(?:(?:[0-9A-Fa-f]{1,4}:){7}(?:[0-9A-Fa-f]{1,4}|:))|(?:(?:[0-9A-Fa-f]{1,4}:){6}(?::[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){5}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,2})|:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){4}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,3})|(?:(?::[0-9A-Fa-f]{1,4})?:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){3}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,4})|(?:(?::[0-9A-Fa-f]{1,4}){0,2}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){2}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,5})|(?:(?::[0-9A-Fa-f]{1,4}){0,3}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){1}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,6})|(?:(?::[0-9A-Fa-f]{1,4}){0,4}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?::(?:(?:(?::[0-9A-Fa-f]{1,4}){1,7})|(?:(?::[0-9A-Fa-f]{1,4}){0,5}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(?:%.+)?\s*";
/// Pattern for dollar amounts: `$`, optional whitespace and sign, 1-3 digits, further
/// three-digit groups (comma optional) and an optional one- or two-digit fraction.
pub const PRICE: &str = r"[$]\s?[+-]?[0-9]{1,3}(?:(?:,?[0-9]{3}))*(?:\.[0-9]{1,2})?";

/// Pattern for 6- or 3-digit hex colours. The leading `#` is optional, so any run of
/// three or six hex digits matches.
pub const HEXCOLOR: &str = r"(?:#?([0-9a-fA-F]{6}|[0-9a-fA-F]{3}))";

/// Pattern for card-like numbers: four groups of four digits optionally separated by
/// `-` or a space, or a bare run of 15-16 digits.
pub const CREDITCARD: &str = r"(?:(?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))";

/// Pattern for 16-digit numbers starting with `4`, in groups of four with optional
/// whitespace or `-` separators.
pub const VISA: &str = r"4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}";

/// Pattern for 16-digit numbers starting with `51`-`55`, in groups of four with optional
/// whitespace or `-` separators.
pub const MASTERCARD: &str = r"5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}";

/// Pattern for strings starting with `1` or `3` followed by 25-34 characters from the
/// listed alphabet (no `0`, `I`, `O` or `l`).
pub const BTCADDRESS: &str = r"[13][a-km-zA-HJ-NP-Z1-9]{25,34}";

/// Pattern for street addresses: a 1-4 digit number, a space, up to 20 word/space
/// characters and one of the listed (lowercase) street-type words.
pub const STREETADDRESS: &str = r"\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?";

/// Pattern for five-digit codes with an optional four-digit extension, on word boundaries.
pub const ZIPCODE: &str = r"\b\d{5}(?:[-\s]\d{4})?\b";

/// Case-insensitive pattern for `P.O. Box <digits>` and its spelling variants.
pub const POBOX: &str = r"(?i)P\.? ?O\.? Box \d+";

/// Pattern for `ddd-dd-dddd` digit groups.
pub const SSN: &str = r"(?:\d{3}-\d{2}-\d{4})";

/// Pattern for 32 consecutive hex digits.
pub const MD5HEX: &str = r"[0-9a-fA-F]{32}";

/// Pattern for 40 consecutive hex digits.
pub const SHA1HEX: &str = r"[0-9a-fA-F]{40}";

/// Pattern for 64 consecutive hex digits.
pub const SHA256HEX: &str = r"[0-9a-fA-F]{64}";

/// Pattern for 8-4-4-4-12 hex digit groups with optional `-` separators.
pub const GUID: &str = r"[0-9a-fA-F]{8}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{12}";

/// Pattern for 12 digits (each optionally followed by `-`) and a final digit or `x`/`X`.
pub const ISBN13: &str = r"(?:[\d]-?){12}[\dxX]";

/// Pattern for 9 digits (each optionally followed by `-`) and a final digit or `x`/`X`.
pub const ISBN10: &str = r"(?:[\d]-?){9}[\dxX]";

/// Pattern for six two-digit hex groups separated by `:` or `-`.
pub const MACADDRESS: &str = r"(([a-fA-F0-9]{2}[:-]){5}([a-fA-F0-9]{2}))";

/// Pattern for two uppercase letters, two digits, four uppercase alphanumerics, seven
/// digits and up to sixteen further uppercase alphanumerics.
pub const IBAN: &str = r"[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z\d]?){0,16}";
/// Pattern for git repository locations: a `git`/`ssh`/`http`/`https` scheme or a
/// `git@host` prefix, a `:` with an optional `//`, a path, the `.git` suffix and an
/// optional trailing `/`.
///
/// Slashes and the colon are written unescaped. The earlier `\\/` and `\\:` spellings
/// inside a raw string required (or admitted) literal backslashes, so the optional `//`
/// and trailing `/` groups could never match a real URL.
pub const GITREPO: &str = r"((git|ssh|http(s)?)|(git@[\w\.]+))(:(//)?)([\w\.@:/\-~]+)(\.git)(/)?";
32
33
/// One list of matched substrings per supported pattern.
///
/// Every entry borrows from the text that was searched (lifetime `'a`); nothing is
/// copied. Besides `Debug`, the type derives `Clone`, `Default` (all lists empty),
/// `PartialEq` and `Eq`, so results can be duplicated and compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CommonRegex<'a> {
    /// Date matches.
    pub dates: Vec<&'a str>,
    /// Time-of-day matches.
    pub times: Vec<&'a str>,
    /// Phone number matches.
    pub phones: Vec<&'a str>,
    /// Phone number matches that include an extension marker.
    pub phones_with_exts: Vec<&'a str>,
    /// Link matches.
    pub links: Vec<&'a str>,
    /// E-mail address matches.
    pub emails: Vec<&'a str>,
    /// IPv4 address matches.
    pub ipv4s: Vec<&'a str>,
    /// IPv6 address matches.
    pub ipv6s: Vec<&'a str>,
    /// Price matches.
    pub prices: Vec<&'a str>,
    /// Hex colour matches.
    pub hex_colors: Vec<&'a str>,
    /// Credit card number matches.
    pub credit_cards: Vec<&'a str>,
    /// Visa card number matches.
    pub visas: Vec<&'a str>,
    /// Mastercard number matches.
    pub mastercards: Vec<&'a str>,
    /// Bitcoin address matches.
    pub btc_addresses: Vec<&'a str>,
    /// Street address matches.
    pub street_addresses: Vec<&'a str>,
    /// ZIP code matches.
    pub zip_codes: Vec<&'a str>,
    /// P.O. box matches.
    pub po_boxs: Vec<&'a str>,
    /// Social-security-number-shaped matches.
    pub ssns: Vec<&'a str>,
    /// 32-hex-digit (MD5-sized) matches.
    pub md5s: Vec<&'a str>,
    /// 40-hex-digit (SHA-1-sized) matches.
    pub sha1s: Vec<&'a str>,
    /// 64-hex-digit (SHA-256-sized) matches.
    pub sha2s: Vec<&'a str>,
    /// GUID matches.
    pub guids: Vec<&'a str>,
    /// ISBN-13 matches.
    pub isbn13s: Vec<&'a str>,
    /// ISBN-10 matches.
    pub isbn10s: Vec<&'a str>,
    /// MAC address matches.
    pub mac_addresses: Vec<&'a str>,
    /// IBAN matches.
    pub ibans: Vec<&'a str>,
    /// Git repository location matches.
    pub gitrepos: Vec<&'a str>,
}
64
65impl <'a>CommonRegex<'a> {
66    pub fn new() -> CommonRegex<'a> {
67        CommonRegex {
68                dates: Vec::new(),
69                times: Vec::new(),
70                phones: Vec::new(),
71                phones_with_exts: Vec::new(),
72                links: Vec::new(),
73                emails: Vec::new(),
74                ipv4s: Vec::new(),
75                ipv6s: Vec::new(),
76                prices: Vec::new(),
77                hex_colors: Vec::new(),
78                credit_cards: Vec::new(),
79                visas: Vec::new(),
80                mastercards: Vec::new(),
81                btc_addresses: Vec::new(),
82                street_addresses: Vec::new(),
83                zip_codes: Vec::new(),
84                po_boxs: Vec::new(),
85                ssns: Vec::new(),
86                md5s: Vec::new(),
87                sha1s: Vec::new(),
88                sha2s: Vec::new(),
89                guids: Vec::new(),
90                isbn13s: Vec::new(),
91                isbn10s: Vec::new(),
92                mac_addresses: Vec::new(),
93                ibans: Vec::new(),
94                gitrepos: Vec::new(),
95        }
96    }
97    pub fn common_regex(&self, text: &'a str) -> CommonRegex<'a> {
98        CommonRegex {
99                dates: dates(text),
100                times: times(text),
101                phones: phones(text),
102                phones_with_exts: phones_with_exts(text),
103                links: links(text),
104                emails: emails(text),
105                ipv4s: ips(text),
106                ipv6s: ipv6s(text),
107                prices: prices(text),
108                hex_colors: hex_colors(text),
109                credit_cards: credit_cards(text),
110                visas: visas(text),
111                mastercards: mastercards(text),
112                btc_addresses: btc_addresses(text),
113                street_addresses: street_addresses(text),
114                zip_codes: zip_codes(text),
115                po_boxs: po_boxs(text),
116                ssns: ssns(text),
117                md5s: md5s(text),
118                sha1s: sha1s(text),
119                sha2s: sha2s(text),
120                guids: guids(text),
121                isbn13s: isbn13s(text),
122                isbn10s: isbn10s(text),
123                mac_addresses: mac_addresses(text),
124                ibans: ibans(text),
125                gitrepos: gitrepos(text),
126        }
127    }
128}
129
130
131pub fn common_regex<'a>(text: &'a str) -> CommonRegex<'a> {
132        CommonRegex {
133                dates: dates(text),
134                times: times(text),
135                phones: phones(text),
136                phones_with_exts: phones_with_exts(text),
137                links: links(text),  // FIXME: Regex parse error
138                emails: emails(text),
139                ipv4s: ips(text),
140                ipv6s: ipv6s(text),
141                prices: prices(text),
142                hex_colors: hex_colors(text),
143                credit_cards: credit_cards(text),
144                visas: visas(text),
145                mastercards: mastercards(text),
146                btc_addresses: btc_addresses(text),
147                street_addresses: street_addresses(text),
148                zip_codes: zip_codes(text),
149                po_boxs: po_boxs(text),
150                ssns: ssns(text),
151                md5s: md5s(text),
152                sha1s: sha1s(text),
153                sha2s: sha2s(text),
154                guids: guids(text),
155                isbn13s: isbn13s(text),
156                isbn10s: isbn10s(text),
157                mac_addresses: mac_addresses(text),
158                ibans: ibans(text),
159                gitrepos: gitrepos(text)
160        }
161    }
162
163
164#[allow(dead_code)]
165pub fn parse<'caps>(regex: &str, text: &'caps str) -> Vec<&'caps str> {
166    let mut caps:Vec<&str> = [].to_vec();
167    for cap in Regex::new(regex).unwrap().captures_iter(text) {
168        caps.push(cap.get(0).map_or("", |m| m.as_str()));
169    }
170    caps
171}
172
173#[allow(dead_code)]
174pub fn dates(text: &str) -> Vec<&str> {
175   parse(DATE, text)
176}
177
178#[allow(dead_code)]
179pub fn times(text: &str) -> Vec<&str> {
180    parse(TIME, text)
181}
182
183#[allow(dead_code)]
184pub fn phones(text: &str) -> Vec<&str> {
185    parse(PHONE, text)
186}
187
188#[allow(dead_code)]
189pub fn phones_with_exts(text: &str) -> Vec<&str> {
190    parse(PHONEWTHEXT, text)
191}
192
193#[allow(dead_code)]
194pub fn links(text: &str) -> Vec<&str> {
195    parse(LINK, text)
196}
197
198#[allow(dead_code)]
199pub fn emails(text: &str) -> Vec<&str> {
200    parse(EMAIL, text)
201}
202
203#[allow(dead_code)]
204pub fn ips(text: &str) -> Vec<&str> {
205    parse(IPV4, text)
206}
207
208#[allow(dead_code)]
209pub fn ipv6s(text: &str) -> Vec<&str> {
210    parse(IPV6, text)
211}
212
213#[allow(dead_code)]
214pub fn prices(text: &str) -> Vec<&str> {
215    parse(PRICE, text)
216}
217
218#[allow(dead_code)]
219pub fn hex_colors(text: &str) -> Vec<&str> {
220    parse(HEXCOLOR, text)
221}
222
223#[allow(dead_code)]
224pub fn credit_cards(text: &str) -> Vec<&str> {
225    parse(CREDITCARD, text)
226}
227
228#[allow(dead_code)]
229pub fn visas(text: &str) -> Vec<&str> {
230    parse(VISA, text)
231}
232
233#[allow(dead_code)]
234pub fn mastercards(text: &str) -> Vec<&str> {
235    parse(MASTERCARD, text)
236}
237
238#[allow(dead_code)]
239pub fn btc_addresses(text: &str) -> Vec<&str> {
240    parse(BTCADDRESS, text)    
241}
242
243#[allow(dead_code)]
244pub fn street_addresses(text: &str) -> Vec<&str> {
245    parse(STREETADDRESS, text)
246}
247
248#[allow(dead_code)]
249pub fn zip_codes(text: &str) -> Vec<&str> {
250    parse(ZIPCODE, text)
251}
252
253#[allow(dead_code)]
254pub fn po_boxs(text: &str) -> Vec<&str> {
255    parse(POBOX, text)
256}
257
258#[allow(dead_code)]
259pub fn ssns(text: &str) -> Vec<&str> {
260    parse(SSN, text)
261}
262
263#[allow(dead_code)]
264pub fn md5s(text: &str) -> Vec<&str> {
265    parse(MD5HEX, text)
266}
267
268#[allow(dead_code)]
269pub fn sha1s(text: &str) -> Vec<&str> {
270    parse(SHA1HEX, text)
271}
272
273#[allow(dead_code)]
274pub fn sha2s(text: &str) -> Vec<&str> {
275    parse(SHA256HEX, text)
276}
277
278#[allow(dead_code)]
279pub fn guids(text: &str) -> Vec<&str> {
280    parse(GUID, text)
281}
282
283#[allow(dead_code)]
284pub fn isbn13s(text: &str) -> Vec<&str> {
285    parse(ISBN13, text)
286}
287
288#[allow(dead_code)]
289pub fn isbn10s(text: &str) -> Vec<&str> {
290    parse(ISBN10, text)
291}
292
293#[allow(dead_code)]
294pub fn mac_addresses(text: &str) -> Vec<&str> {
295    parse(MACADDRESS, text)
296}
297
298#[allow(dead_code)]
299pub fn ibans(text: &str) -> Vec<&str> {
300    parse(IBAN, text)
301}
302
303#[allow(dead_code)]
304pub fn gitrepos(text: &str) -> Vec<&str> {
305    parse(GITREPO, text)
306}
307
308
#[cfg(test)]
mod tests {
    use super::{common_regex, ipv6s, prices, times};

    /// Runs every extractor over a sample message and compares the `Debug` rendering
    /// of the combined result with the expected text.
    #[test]
    fn test_common_regex() {
        let text = "John, please get that article on www.linkedin.com to me by 5:00PM 
                               on Jan 9th 2012. 4:00 would be ideal, actually. If you have any 
                               questions, You can reach me at (519)-236-2723x341 or get in touch with
                               my associate at harold.smith@gmail.com";

        let rendered = format!("{:?}", common_regex(text));
        let expected = r#"CommonRegex { dates: ["Jan 9th 2012"], times: ["5:00PM", "4:00 "], phones: ["(519)-236-2723"], phones_with_exts: ["(519)-236-2723x341"], links: ["www.linkedin.com", "harold.smith@gmail.com"], emails: ["harold.smith@gmail.com"], ipv4s: [], ipv6s: [], prices: [], hex_colors: ["201", "dea", "eac", "519", "236", "272", "341"], credit_cards: [], visas: [], mastercards: [], btc_addresses: [], street_addresses: [], zip_codes: [], po_boxs: [], ssns: [], md5s: [], sha1s: [], sha2s: [], guids: [], isbn13s: [], isbn10s: [], mac_addresses: [], ibans: [], gitrepos: [] }"#;
        assert_eq!(rendered, expected);
    }

    /// A time directly followed by punctuation is reported without trailing characters.
    #[test]
    fn test_times() {
        let found = times("When are you free? Do you want to meet up for coffee at 4:00?");
        assert_eq!(format!("{:?}", found), r#"["4:00"]"#);
    }

    /// Pins the current price results, including `$110` for the input `$1100.4`.
    #[test]
    fn test_prices() {
        let found = prices("They said the price was US$5,000.90, actually it is US$3,900.5. It\'s $1100.4 less, can you imagine this?");
        assert_eq!(format!("{:?}", found), r#"["$5,000.90", "$3,900.5", "$110"]"#);
    }

    /// Both the full and the compressed loopback spelling are found.
    #[test]
    fn test_ipv6s() {
        let found = ipv6s("The IPv6 address for localhost is 0:0:0:0:0:0:0:1, or alternatively, ::1.");
        assert_eq!(format!("{:?}", found), r#"["0:0:0:0:0:0:0:1", "::1"]"#);
    }
}
339}