1extern crate regex;
2use regex::Regex;
3
4
/// Case-insensitive pattern for dates: a day and month name in either order
/// (`9th of January`, `Jan 9th`), optionally followed by a comma and a
/// four-digit year, or three numeric groups separated by `-`, `.` or `/`.
pub const DATE: &str = r#"(?i)(?:[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+[0-3]?\d(?:st|nd|rd|th)?)(?:,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}"#;

/// Case-insensitive pattern for clock times: `h:mm` / `hh:mm` with an optional
/// space and am/pm marker, or a single digit directly followed by am/pm.
pub const TIME: &str = r#"(?i)\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?"#;

/// Pattern for phone numbers: an optional country code and optional
/// three-digit area code followed by a 3 + 4..6 digit number, or a two-digit
/// country code followed by 2, 3 and 4 digit groups.
pub const PHONE: &str = r#"(?:(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4,6})|(?:(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4})"#;

/// Case-insensitive pattern for phone numbers that end in an extension marker
/// (`#`, `x`, `ext` or `extension`), optionally followed by the extension digits.
pub const PHONEWTHEXT: &str = r#"(?i)(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?(?:[0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(?:\d+)?)"#;
/// Pattern for web links: an optional `http://` / `https://` scheme, an
/// optional `www.` prefix, a run of URL characters ending in `.` plus a 2-6
/// letter lowercase suffix, and an optional path/query tail.
///
/// The scheme separator is written as a plain `//`. Inside a raw string a
/// doubled backslash before each slash would make the regex demand a literal
/// `\/\/`, so the scheme of an ordinary URL would never be part of the match.
pub const LINK: &str = r#"(http(s)?://.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"#;
/// Case-insensitive pattern for e-mail addresses: a local part, `@`, and one
/// or more dot-separated domain labels.
// NOTE(review): the local-part class contains `\\`, which admits a literal
// backslash; this looks like an escaping artifact — confirm before changing.
pub const EMAIL: &str = r#"(?i)([A-Za-z0-9!#$%&'*+\\/=?^_{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"#;

/// Pattern for dotted-quad IPv4 addresses with each part limited to 0-255.
pub const IPV4: &str = r#"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"#;

/// Pattern for IPv6 addresses, including `::`-compressed forms and forms with
/// an embedded dotted-quad tail. An optional `%...` suffix and any trailing
/// whitespace are consumed as part of the match.
pub const IPV6: &str = r#"(?:(?:(?:[0-9A-Fa-f]{1,4}:){7}(?:[0-9A-Fa-f]{1,4}|:))|(?:(?:[0-9A-Fa-f]{1,4}:){6}(?::[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){5}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,2})|:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(?:(?:[0-9A-Fa-f]{1,4}:){4}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,3})|(?:(?::[0-9A-Fa-f]{1,4})?:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){3}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,4})|(?:(?::[0-9A-Fa-f]{1,4}){0,2}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){2}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,5})|(?:(?::[0-9A-Fa-f]{1,4}){0,3}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?:(?:[0-9A-Fa-f]{1,4}:){1}(?:(?:(?::[0-9A-Fa-f]{1,4}){1,6})|(?:(?::[0-9A-Fa-f]{1,4}){0,4}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(?::(?:(?:(?::[0-9A-Fa-f]{1,4}){1,7})|(?:(?::[0-9A-Fa-f]{1,4}){0,5}:(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(?:%.+)?\s*"#;

/// Pattern for `$`-prefixed amounts with an optional sign, optional
/// comma-separated thousands groups and up to two decimal digits.
pub const PRICE: &str = r#"[$]\s?[+-]?[0-9]{1,3}(?:(?:,?[0-9]{3}))*(?:\.[0-9]{1,2})?"#;

/// Pattern for 6- or 3-digit hexadecimal colours; the leading `#` is optional.
pub const HEXCOLOR: &str = r#"(?:#?([0-9a-fA-F]{6}|[0-9a-fA-F]{3}))"#;

/// Pattern for card numbers: four groups of four digits optionally separated
/// by `-` or a space, or an unbroken run of 15-16 digits.
pub const CREDITCARD: &str = r#"(?:(?:(?:\d{4}[- ]?){3}\d{4}|\d{15,16}))"#;

/// Pattern for 16-digit numbers starting with `4`, in groups of four
/// optionally separated by whitespace or `-`.
pub const VISA: &str = r#"4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}"#;

/// Pattern for 16-digit numbers starting with `51`-`55`, in groups of four
/// optionally separated by whitespace or `-`.
pub const MASTERCARD: &str = r#"5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}"#;

/// Pattern for strings starting with `1` or `3` followed by 25-34
/// alphanumeric characters, excluding `0`, `O`, `I` and `l`.
pub const BTCADDRESS: &str = r#"[13][a-km-zA-HJ-NP-Z1-9]{25,34}"#;

/// Pattern for street addresses: a 1-4 digit number, a space, up to 20
/// word/space characters and a lowercase street-type suffix such as `street`,
/// `ave` or `blvd`.
pub const STREETADDRESS: &str = r#"\d{1,4} [\w\s]{1,20}(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\W?"#;

/// Pattern for five-digit codes with an optional four-digit extension joined
/// by `-` or whitespace, delimited by word boundaries.
pub const ZIPCODE: &str = r#"\b\d{5}(?:[-\s]\d{4})?\b"#;

/// Case-insensitive pattern for `P.O. Box <digits>` with optional dots and
/// an optional space between `P` and `O`.
pub const POBOX: &str = r#"(?i)P\.? ?O\.? Box \d+"#;

/// Pattern for numbers shaped `ddd-dd-dddd`.
pub const SSN: &str = r#"(?:\d{3}-\d{2}-\d{4})"#;

/// Pattern for a run of 32 hexadecimal digits.
pub const MD5HEX: &str = r#"[0-9a-fA-F]{32}"#;

/// Pattern for a run of 40 hexadecimal digits.
pub const SHA1HEX: &str = r#"[0-9a-fA-F]{40}"#;

/// Pattern for a run of 64 hexadecimal digits.
pub const SHA256HEX: &str = r#"[0-9a-fA-F]{64}"#;

/// Pattern for hexadecimal groups of 8-4-4-4-12 digits; the hyphens between
/// groups are optional.
pub const GUID: &str = r#"[0-9a-fA-F]{8}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{4}-?[a-fA-F0-9]{12}"#;

/// Pattern for twelve digits, each optionally followed by `-`, then a final
/// digit or `x`/`X`.
pub const ISBN13: &str = r#"(?:[\d]-?){12}[\dxX]"#;

/// Pattern for nine digits, each optionally followed by `-`, then a final
/// digit or `x`/`X`.
pub const ISBN10: &str = r#"(?:[\d]-?){9}[\dxX]"#;

/// Pattern for six hexadecimal pairs separated by `:` or `-`.
pub const MACADDRESS: &str = r#"(([a-fA-F0-9]{2}[:-]){5}([a-fA-F0-9]{2}))"#;

/// Pattern for two uppercase letters, two digits, four uppercase
/// alphanumerics, seven digits and up to sixteen further uppercase
/// alphanumerics.
pub const IBAN: &str = r#"[A-Z]{2}\d{2}[A-Z0-9]{4}\d{7}([A-Z\d]?){0,16}"#;
/// Pattern for git repository locations: a `git`, `ssh`, `http` or `https`
/// scheme (or a `git@host` prefix), a `:` optionally followed by `//`, a path,
/// the `.git` suffix and an optional trailing `/`.
///
/// Slashes are written unescaped. Inside a raw string a doubled backslash
/// before a slash makes the regex look for a literal backslash, so the `//`
/// separator and the trailing `/` could never be matched by their own groups.
pub const GITREPO: &str = r#"((git|ssh|http(s)?)|(git@[\w\.]+))(:(//)?)([\w\.@:/\-~]+)(\.git)(/)?"#;
32
33
/// Matches extracted from a single text, grouped by category.
///
/// Every field holds string slices borrowed from the scanned text, so a value
/// cannot outlive that text (lifetime `'a`). The module-level `common_regex`
/// function fills each field from the pattern named in the field's comment.
///
/// `Default` yields a value in which every category is empty; `Clone`,
/// `PartialEq` and `Eq` allow results to be copied and compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CommonRegex<'a> {
    /// Matches of the `DATE` pattern.
    pub dates: Vec<&'a str>,
    /// Matches of the `TIME` pattern.
    pub times: Vec<&'a str>,
    /// Matches of the `PHONE` pattern.
    pub phones: Vec<&'a str>,
    /// Matches of the `PHONEWTHEXT` pattern.
    pub phones_with_exts: Vec<&'a str>,
    /// Matches of the `LINK` pattern.
    pub links: Vec<&'a str>,
    /// Matches of the `EMAIL` pattern.
    pub emails: Vec<&'a str>,
    /// Matches of the `IPV4` pattern.
    pub ipv4s: Vec<&'a str>,
    /// Matches of the `IPV6` pattern.
    pub ipv6s: Vec<&'a str>,
    /// Matches of the `PRICE` pattern.
    pub prices: Vec<&'a str>,
    /// Matches of the `HEXCOLOR` pattern.
    pub hex_colors: Vec<&'a str>,
    /// Matches of the `CREDITCARD` pattern.
    pub credit_cards: Vec<&'a str>,
    /// Matches of the `VISA` pattern.
    pub visas: Vec<&'a str>,
    /// Matches of the `MASTERCARD` pattern.
    pub mastercards: Vec<&'a str>,
    /// Matches of the `BTCADDRESS` pattern.
    pub btc_addresses: Vec<&'a str>,
    /// Matches of the `STREETADDRESS` pattern.
    pub street_addresses: Vec<&'a str>,
    /// Matches of the `ZIPCODE` pattern.
    pub zip_codes: Vec<&'a str>,
    /// Matches of the `POBOX` pattern.
    pub po_boxs: Vec<&'a str>,
    /// Matches of the `SSN` pattern.
    pub ssns: Vec<&'a str>,
    /// Matches of the `MD5HEX` pattern.
    pub md5s: Vec<&'a str>,
    /// Matches of the `SHA1HEX` pattern.
    pub sha1s: Vec<&'a str>,
    /// Matches of the `SHA256HEX` pattern.
    pub sha2s: Vec<&'a str>,
    /// Matches of the `GUID` pattern.
    pub guids: Vec<&'a str>,
    /// Matches of the `ISBN13` pattern.
    pub isbn13s: Vec<&'a str>,
    /// Matches of the `ISBN10` pattern.
    pub isbn10s: Vec<&'a str>,
    /// Matches of the `MACADDRESS` pattern.
    pub mac_addresses: Vec<&'a str>,
    /// Matches of the `IBAN` pattern.
    pub ibans: Vec<&'a str>,
    /// Matches of the `GITREPO` pattern.
    pub gitrepos: Vec<&'a str>,
}
64
65impl <'a>CommonRegex<'a> {
66 pub fn new() -> CommonRegex<'a> {
67 CommonRegex {
68 dates: Vec::new(),
69 times: Vec::new(),
70 phones: Vec::new(),
71 phones_with_exts: Vec::new(),
72 links: Vec::new(),
73 emails: Vec::new(),
74 ipv4s: Vec::new(),
75 ipv6s: Vec::new(),
76 prices: Vec::new(),
77 hex_colors: Vec::new(),
78 credit_cards: Vec::new(),
79 visas: Vec::new(),
80 mastercards: Vec::new(),
81 btc_addresses: Vec::new(),
82 street_addresses: Vec::new(),
83 zip_codes: Vec::new(),
84 po_boxs: Vec::new(),
85 ssns: Vec::new(),
86 md5s: Vec::new(),
87 sha1s: Vec::new(),
88 sha2s: Vec::new(),
89 guids: Vec::new(),
90 isbn13s: Vec::new(),
91 isbn10s: Vec::new(),
92 mac_addresses: Vec::new(),
93 ibans: Vec::new(),
94 gitrepos: Vec::new(),
95 }
96 }
97 pub fn common_regex(&self, text: &'a str) -> CommonRegex<'a> {
98 CommonRegex {
99 dates: dates(text),
100 times: times(text),
101 phones: phones(text),
102 phones_with_exts: phones_with_exts(text),
103 links: links(text),
104 emails: emails(text),
105 ipv4s: ips(text),
106 ipv6s: ipv6s(text),
107 prices: prices(text),
108 hex_colors: hex_colors(text),
109 credit_cards: credit_cards(text),
110 visas: visas(text),
111 mastercards: mastercards(text),
112 btc_addresses: btc_addresses(text),
113 street_addresses: street_addresses(text),
114 zip_codes: zip_codes(text),
115 po_boxs: po_boxs(text),
116 ssns: ssns(text),
117 md5s: md5s(text),
118 sha1s: sha1s(text),
119 sha2s: sha2s(text),
120 guids: guids(text),
121 isbn13s: isbn13s(text),
122 isbn10s: isbn10s(text),
123 mac_addresses: mac_addresses(text),
124 ibans: ibans(text),
125 gitrepos: gitrepos(text),
126 }
127 }
128}
129
130
131pub fn common_regex<'a>(text: &'a str) -> CommonRegex<'a> {
132 CommonRegex {
133 dates: dates(text),
134 times: times(text),
135 phones: phones(text),
136 phones_with_exts: phones_with_exts(text),
137 links: links(text), emails: emails(text),
139 ipv4s: ips(text),
140 ipv6s: ipv6s(text),
141 prices: prices(text),
142 hex_colors: hex_colors(text),
143 credit_cards: credit_cards(text),
144 visas: visas(text),
145 mastercards: mastercards(text),
146 btc_addresses: btc_addresses(text),
147 street_addresses: street_addresses(text),
148 zip_codes: zip_codes(text),
149 po_boxs: po_boxs(text),
150 ssns: ssns(text),
151 md5s: md5s(text),
152 sha1s: sha1s(text),
153 sha2s: sha2s(text),
154 guids: guids(text),
155 isbn13s: isbn13s(text),
156 isbn10s: isbn10s(text),
157 mac_addresses: mac_addresses(text),
158 ibans: ibans(text),
159 gitrepos: gitrepos(text)
160 }
161 }
162
163
164#[allow(dead_code)]
165pub fn parse<'caps>(regex: &str, text: &'caps str) -> Vec<&'caps str> {
166 let mut caps:Vec<&str> = [].to_vec();
167 for cap in Regex::new(regex).unwrap().captures_iter(text) {
168 caps.push(cap.get(0).map_or("", |m| m.as_str()));
169 }
170 caps
171}
172
173#[allow(dead_code)]
174pub fn dates(text: &str) -> Vec<&str> {
175 parse(DATE, text)
176}
177
178#[allow(dead_code)]
179pub fn times(text: &str) -> Vec<&str> {
180 parse(TIME, text)
181}
182
183#[allow(dead_code)]
184pub fn phones(text: &str) -> Vec<&str> {
185 parse(PHONE, text)
186}
187
188#[allow(dead_code)]
189pub fn phones_with_exts(text: &str) -> Vec<&str> {
190 parse(PHONEWTHEXT, text)
191}
192
193#[allow(dead_code)]
194pub fn links(text: &str) -> Vec<&str> {
195 parse(LINK, text)
196}
197
198#[allow(dead_code)]
199pub fn emails(text: &str) -> Vec<&str> {
200 parse(EMAIL, text)
201}
202
203#[allow(dead_code)]
204pub fn ips(text: &str) -> Vec<&str> {
205 parse(IPV4, text)
206}
207
208#[allow(dead_code)]
209pub fn ipv6s(text: &str) -> Vec<&str> {
210 parse(IPV6, text)
211}
212
213#[allow(dead_code)]
214pub fn prices(text: &str) -> Vec<&str> {
215 parse(PRICE, text)
216}
217
218#[allow(dead_code)]
219pub fn hex_colors(text: &str) -> Vec<&str> {
220 parse(HEXCOLOR, text)
221}
222
223#[allow(dead_code)]
224pub fn credit_cards(text: &str) -> Vec<&str> {
225 parse(CREDITCARD, text)
226}
227
228#[allow(dead_code)]
229pub fn visas(text: &str) -> Vec<&str> {
230 parse(VISA, text)
231}
232
233#[allow(dead_code)]
234pub fn mastercards(text: &str) -> Vec<&str> {
235 parse(MASTERCARD, text)
236}
237
238#[allow(dead_code)]
239pub fn btc_addresses(text: &str) -> Vec<&str> {
240 parse(BTCADDRESS, text)
241}
242
243#[allow(dead_code)]
244pub fn street_addresses(text: &str) -> Vec<&str> {
245 parse(STREETADDRESS, text)
246}
247
248#[allow(dead_code)]
249pub fn zip_codes(text: &str) -> Vec<&str> {
250 parse(ZIPCODE, text)
251}
252
253#[allow(dead_code)]
254pub fn po_boxs(text: &str) -> Vec<&str> {
255 parse(POBOX, text)
256}
257
258#[allow(dead_code)]
259pub fn ssns(text: &str) -> Vec<&str> {
260 parse(SSN, text)
261}
262
263#[allow(dead_code)]
264pub fn md5s(text: &str) -> Vec<&str> {
265 parse(MD5HEX, text)
266}
267
268#[allow(dead_code)]
269pub fn sha1s(text: &str) -> Vec<&str> {
270 parse(SHA1HEX, text)
271}
272
273#[allow(dead_code)]
274pub fn sha2s(text: &str) -> Vec<&str> {
275 parse(SHA256HEX, text)
276}
277
278#[allow(dead_code)]
279pub fn guids(text: &str) -> Vec<&str> {
280 parse(GUID, text)
281}
282
283#[allow(dead_code)]
284pub fn isbn13s(text: &str) -> Vec<&str> {
285 parse(ISBN13, text)
286}
287
288#[allow(dead_code)]
289pub fn isbn10s(text: &str) -> Vec<&str> {
290 parse(ISBN10, text)
291}
292
293#[allow(dead_code)]
294pub fn mac_addresses(text: &str) -> Vec<&str> {
295 parse(MACADDRESS, text)
296}
297
298#[allow(dead_code)]
299pub fn ibans(text: &str) -> Vec<&str> {
300 parse(IBAN, text)
301}
302
303#[allow(dead_code)]
304pub fn gitrepos(text: &str) -> Vec<&str> {
305 parse(GITREPO, text)
306}
307
308
#[cfg(test)]
mod tests {
    use super::{common_regex, ipv6s, prices, times};

    /// Snapshot of the full result set for a short message, compared through
    /// its `Debug` rendering.
    #[test]
    fn test_common_regex() {
        let text = "John, please get that article on www.linkedin.com to me by 5:00PM
        on Jan 9th 2012. 4:00 would be ideal, actually. If you have any
        questions, You can reach me at (519)-236-2723x341 or get in touch with
        my associate at harold.smith@gmail.com";

        let rendered = format!("{:?}", common_regex(text));
        let expected = r#"CommonRegex { dates: ["Jan 9th 2012"], times: ["5:00PM", "4:00 "], phones: ["(519)-236-2723"], phones_with_exts: ["(519)-236-2723x341"], links: ["www.linkedin.com", "harold.smith@gmail.com"], emails: ["harold.smith@gmail.com"], ipv4s: [], ipv6s: [], prices: [], hex_colors: ["201", "dea", "eac", "519", "236", "272", "341"], credit_cards: [], visas: [], mastercards: [], btc_addresses: [], street_addresses: [], zip_codes: [], po_boxs: [], ssns: [], md5s: [], sha1s: [], sha2s: [], guids: [], isbn13s: [], isbn10s: [], mac_addresses: [], ibans: [], gitrepos: [] }"#;
        assert_eq!(rendered, expected);
    }

    /// A bare `h:mm` time directly followed by punctuation is matched.
    #[test]
    fn test_times() {
        let found = times("When are you free? Do you want to meet up for coffee at 4:00?");
        assert_eq!(format!("{:?}", found), r#"["4:00"]"#);
    }

    /// Pins the current output for grouped and ungrouped dollar amounts.
    #[test]
    fn test_prices() {
        let found = prices("They said the price was US$5,000.90, actually it is US$3,900.5. It\'s $1100.4 less, can you imagine this?");
        assert_eq!(format!("{:?}", found), r#"["$5,000.90", "$3,900.5", "$110"]"#);
    }

    /// Both the full and the `::`-compressed spelling of an address are matched.
    #[test]
    fn test_ipv6s() {
        let found = ipv6s("The IPv6 address for localhost is 0:0:0:0:0:0:0:1, or alternatively, ::1.");
        assert_eq!(format!("{:?}", found), r#"["0:0:0:0:0:0:0:1", "::1"]"#);
    }
}