regex_cache/
cache.rs

1// Copyright 2017 1aim GmbH
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy of
4// this software and associated documentation files (the "Software"), to deal in
5// the Software without restriction, including without limitation the rights to
6// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7// of the Software, and to permit persons to whom the Software is furnished to do
8// so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in all
11// copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19// SOFTWARE.
20
21use std::ops::{Deref, DerefMut};
22use std::sync::{Mutex, Arc};
23use std::borrow::Cow;
24use std::fmt;
25use std::str;
26
27use regex::{Regex, RegexBuilder, Error};
28use regex::{Match, Captures, Replacer};
29use crate::syntax;
30use crate::options::Options;
31use crate::lru::LruCache;
32
33/// An LRU cache for regular expressions.
34#[derive(Clone, Debug)]
35pub struct RegexCache(LruCache<String, Regex>);
36
37impl RegexCache {
38	/// Create a new LRU cache with the given size limit.
39	pub fn new(capacity: usize) -> RegexCache {
40		RegexCache(LruCache::new(capacity))
41	}
42
43	/// Save the given regular expression in the cache.
44	///
45	/// # Example
46	///
47	/// ```
48	/// # use regex_cache::{Regex, RegexCache};
49	/// let mut cache = RegexCache::new(100);
50	/// let     re    = Regex::new(r"^\d+$").unwrap();
51	///
52	/// // By saving the previously created regular expression further calls to
53	/// // `compile` won't actually compile the regular expression.
54	/// cache.save(re);
55	///
56	/// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234"));
57	/// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd"));
58	/// ```
59	pub fn save(&mut self, re: Regex) -> &Regex {
60		let source = re.as_str().to_owned();
61
62		if !self.0.contains_key(re.as_str()) {
63			self.insert(source.clone(), re);
64		}
65
66		self.0.get_mut(&source).unwrap()
67	}
68
69	/// Create a new regular expression in the cache.
70	///
71	/// # Example
72	///
73	/// ```
74	/// # use regex_cache::RegexCache;
75	/// let mut cache = RegexCache::new(100);
76	///
77	/// assert!(cache.compile(r"^\d+$").unwrap().is_match("1234"));
78	/// assert!(!cache.compile(r"^\d+$").unwrap().is_match("abcd"));
79	/// ```
80	pub fn compile(&mut self, source: &str) -> Result<&Regex, Error> {
81		if !self.0.contains_key(source) {
82			self.0.insert(source.into(), Regex::new(source)?);
83		}
84
85		Ok(self.0.get_mut(source).unwrap())
86	}
87
88	/// Configure a new regular expression.
89	///
90	/// # Example
91	///
92	/// ```
93	/// # use regex_cache::RegexCache;
94	/// let mut cache = RegexCache::new(100);
95	///
96	/// assert!(cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap()
97	/// 	.is_match("ABC"));
98	///
99	/// assert!(!cache.configure(r"abc", |b| b.case_insensitive(true)).unwrap()
100	/// 	.is_match("123"));
101	/// ```
102	pub fn configure<F>(&mut self, source: &str, f: F) -> Result<&Regex, Error>
103		where F: FnOnce(&mut RegexBuilder) -> &mut RegexBuilder
104	{
105		if !self.0.contains_key(source) {
106			self.0.insert(source.into(), f(&mut RegexBuilder::new(source)).build()?);
107		}
108
109		Ok(self.0.get_mut(source).unwrap())
110	}
111}
112
113impl Deref for RegexCache {
114	type Target = LruCache<String, Regex>;
115
116	fn deref(&self) -> &Self::Target {
117		&self.0
118	}
119}
120
121impl DerefMut for RegexCache {
122	fn deref_mut(&mut self) -> &mut Self::Target {
123		&mut self.0
124	}
125}
126
127#[derive(Clone)]
128pub struct CachedRegex {
129	builder: CachedRegexBuilder,
130}
131
// Expands to a `&Regex` for the given `CachedRegex`: locks the shared cache
// and asks it for the pattern, compiling it with the builder's options when
// it is not cached yet.
//
// The expansion has to stay a single expression: the returned reference
// borrows from the temporary `MutexGuard`, which is only kept alive (and the
// cache kept locked) until the end of the statement using the macro.
//
// Panics when the mutex is poisoned or when the pattern fails to compile.
macro_rules! regex {
	($self:ident) => {
		$self
			.builder
			.cache
			.lock()
			.unwrap()
			.configure(&$self.builder.source, |b| $self.builder.options.define(b))
			.unwrap()
	};
}
138
139impl CachedRegex {
140	/// Create a new cached `Regex` for the given source, checking the syntax is
141	/// valid.
142	pub fn new(cache: Arc<Mutex<RegexCache>>, source: &str) -> Result<CachedRegex, Error> {
143		if let Err(err) = syntax::Parser::new().parse(source) {
144			return Err(Error::Syntax(err.to_string()));
145		}
146
147		Ok(CachedRegex::new_unchecked(cache, source))
148	}
149
150	/// Create a new cached `Regex` for the given source, without checking if the 
151	/// syntax is valid.
152	/// 
153	/// Only use this if you know that the syntax is valid or you are ready to 
154	/// handle potential syntax errors later on.
155	pub fn new_unchecked(cache: Arc<Mutex<RegexCache>>, source: &str) -> CachedRegex {
156		CachedRegex::from(CachedRegexBuilder::new(cache, source))
157	}
158
159	fn from(builder: CachedRegexBuilder) -> Self {
160		CachedRegex {
161			builder: builder,
162		}
163	}
164
165	/// Refer to `Regex::is_match`.
166	pub fn is_match(&self, text: &str) -> bool {
167		regex!(self).is_match(text)
168	}
169
170	/// Refer to `Regex::find`.
171	pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
172		regex!(self).find(text)
173	}
174
175	/// Refer to `Regex::captures`.
176	pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
177		regex!(self).captures(text)
178	}
179
180	/// Refer to `Regex::replace`.
181	pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
182		regex!(self).replace(text, rep)
183	}
184
185	/// Refer to `Regex::replace_all`.
186	pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
187		regex!(self).replace_all(text, rep)
188	}
189
190	/// Refer to `Regex::shortest_match`.
191	pub fn shortest_match(&self, text: &str) -> Option<usize> {
192		regex!(self).shortest_match(text)
193	}
194
195	pub fn captures_len(&self) -> usize {
196		regex!(self).captures_len()
197	}
198
199	pub fn as_str(&self) -> &str {
200		&self.builder.source
201	}
202}
203
204impl fmt::Debug for CachedRegex {
205	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
206		fmt::Debug::fmt(regex!(self), f)
207	}
208}
209
210impl fmt::Display for CachedRegex {
211	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
212		fmt::Display::fmt(regex!(self), f)
213	}
214}
215
216/// A configurable builder for a cached `Regex`.
217#[derive(Clone, Debug)]
218pub struct CachedRegexBuilder {
219	cache:   Arc<Mutex<RegexCache>>,
220	source:  String,
221	options: Options,
222}
223
224impl CachedRegexBuilder {
225	/// Create a new regular expression builder with the given pattern.
226	///
227	/// If the pattern is invalid, then an error will be returned when
228	/// `compile` is called.
229	pub fn new(cache: Arc<Mutex<RegexCache>>, source: &str) -> CachedRegexBuilder {
230		CachedRegexBuilder {
231			cache:   cache,
232			source:  source.to_owned(),
233			options: Default::default(),
234		}
235	}
236
237	/// Consume the builder and compile the regular expression.
238	///
239	/// Note that calling `as_str` on the resulting `Regex` will produce the
240	/// pattern given to `new` verbatim. Notably, it will not incorporate any
241	/// of the flags set on this builder.
242	pub fn build(&self) -> Result<CachedRegex, Error> {
243		if let Err(err) = syntax::Parser::new().parse(&self.source) {
244			return Err(Error::Syntax(err.to_string()));
245		}
246
247		Ok(CachedRegex::from(self.clone()))
248	}
249
250	/// Consume the builder and compile the regular expression without checking 
251	/// if the syntax is valid.
252	/// 
253	/// Only use this if you know that the syntax is valid or you are ready to 
254	/// handle potential syntax errors later on.
255	///
256	/// Note that calling `as_str` on the resulting `Regex` will produce the
257	/// pattern given to `new` verbatim. Notably, it will not incorporate any
258	/// of the flags set on this builder.
259	pub fn build_unchecked(&self) -> CachedRegex {
260		CachedRegex::from(self.clone())
261	}
262
263	/// Set the value for the case insensitive (`i`) flag.
264	pub fn case_insensitive(&mut self, yes: bool) -> &mut CachedRegexBuilder {
265		self.options.case_insensitive = yes;
266		self
267	}
268
269	/// Set the value for the multi-line matching (`m`) flag.
270	pub fn multi_line(&mut self, yes: bool) -> &mut CachedRegexBuilder {
271		self.options.multi_line = yes;
272		self
273	}
274
275	/// Set the value for the any character (`s`) flag, where in `.` matches
276	/// anything when `s` is set and matches anything except for new line when
277	/// it is not set (the default).
278	///
279	/// N.B. "matches anything" means "any byte" for `regex::bytes::Regex`
280	/// expressions and means "any Unicode scalar value" for `regex::Regex`
281	/// expressions.
282	pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut CachedRegexBuilder {
283		self.options.dot_matches_new_line = yes;
284		self
285	}
286
287	/// Set the value for the greedy swap (`U`) flag.
288	pub fn swap_greed(&mut self, yes: bool) -> &mut CachedRegexBuilder {
289		self.options.swap_greed = yes;
290		self
291	}
292
293	/// Set the value for the ignore whitespace (`x`) flag.
294	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut CachedRegexBuilder {
295		self.options.ignore_whitespace = yes;
296		self
297	}
298
299	/// Set the value for the Unicode (`u`) flag.
300	pub fn unicode(&mut self, yes: bool) -> &mut CachedRegexBuilder {
301		self.options.unicode = yes;
302		self
303	}
304
305	/// Set the approximate size limit of the compiled regular expression.
306	///
307	/// This roughly corresponds to the number of bytes occupied by a single
308	/// compiled program. If the program exceeds this number, then a
309	/// compilation error is returned.
310	pub fn size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder {
311		self.options.size_limit = limit;
312		self
313	}
314
315	/// Set the approximate size of the cache used by the DFA.
316	///
317	/// This roughly corresponds to the number of bytes that the DFA will
318	/// use while searching.
319	///
320	/// Note that this is a *per thread* limit. There is no way to set a global
321	/// limit. In particular, if a regex is used from multiple threads
322	/// simulanteously, then each thread may use up to the number of bytes
323	/// specified here.
324	pub fn dfa_size_limit(&mut self, limit: usize) -> &mut CachedRegexBuilder {
325		self.options.dfa_size_limit = limit;
326		self
327	}
328}
329
#[cfg(test)]
mod test {
	use std::sync::{Arc, Mutex};
	use crate::cache::{RegexCache, CachedRegex};

	/// Compiling more patterns than the capacity must not grow the cache.
	#[test]
	fn respects_limit() {
		let mut cache = RegexCache::new(2);

		// Fill the cache exactly up to its capacity.
		for pattern in &["[01]2", "[21]0"] {
			cache.compile(pattern).unwrap();
		}
		assert_eq!(cache.len(), 2);

		// One more pattern must leave the number of entries unchanged.
		cache.compile("[21]3").unwrap();
		assert_eq!(cache.len(), 2);
	}

	/// A `CachedRegex` matches like the pattern it was created from.
	#[test]
	fn cached_regex() {
		let shared = Arc::new(Mutex::new(RegexCache::new(100)));
		let digits = CachedRegex::new(Arc::clone(&shared), r"^\d+$").unwrap();

		assert!(digits.is_match("123"));
		assert!(!digits.is_match("abc"));
	}
}
355}