jj_lib/
str_util.rs

1// Copyright 2021-2023 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! String helpers.
16
17use std::borrow::Borrow;
18use std::borrow::Cow;
19use std::collections::BTreeMap;
20use std::fmt;
21use std::fmt::Debug;
22use std::ops::Deref;
23
24use bstr::ByteSlice as _;
25use either::Either;
26use globset::Glob;
27use globset::GlobBuilder;
28use thiserror::Error;
29
30/// Error occurred during pattern string parsing.
31#[derive(Debug, Error)]
32pub enum StringPatternParseError {
33    /// Unknown pattern kind is specified.
34    #[error("Invalid string pattern kind `{0}:`")]
35    InvalidKind(String),
36    /// Failed to parse glob pattern.
37    #[error(transparent)]
38    GlobPattern(globset::Error),
39    /// Failed to parse regular expression.
40    #[error(transparent)]
41    Regex(regex::Error),
42}
43
44/// A wrapper for [`Glob`] and its matcher with a more concise `Debug` impl.
45#[derive(Clone)]
46pub struct GlobPattern {
47    glob: Glob,
48    // TODO: Maybe better to add StringPattern::to_matcher(), and move regex
49    // compilation there.
50    regex: regex::bytes::Regex,
51}
52
53impl GlobPattern {
54    /// Returns true if this pattern matches `haystack`.
55    pub fn is_match(&self, haystack: &[u8]) -> bool {
56        self.regex.is_match(haystack)
57    }
58
59    /// Returns the original glob pattern.
60    pub fn as_str(&self) -> &str {
61        self.glob.glob()
62    }
63
64    /// Converts this glob pattern to a bytes regex.
65    pub fn to_regex(&self) -> regex::bytes::Regex {
66        self.regex.clone()
67    }
68}
69
70impl Debug for GlobPattern {
71    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
72        f.debug_tuple("GlobPattern").field(&self.as_str()).finish()
73    }
74}
75
76fn parse_glob(src: &str, icase: bool) -> Result<GlobPattern, StringPatternParseError> {
77    let glob = GlobBuilder::new(src)
78        .case_insensitive(icase)
79        // Don't use platform-dependent default. This pattern isn't meant for
80        // testing file-system paths. If backslash escape were disabled, "\" in
81        // pattern would be normalized to "/" on Windows.
82        .backslash_escape(true)
83        .build()
84        .map_err(StringPatternParseError::GlobPattern)?;
85    // Based on new_regex() in globset. We don't use GlobMatcher::is_match(path)
86    // because the input string shouldn't be normalized as path.
87    let regex = regex::bytes::RegexBuilder::new(glob.regex())
88        .dot_matches_new_line(true)
89        .build()
90        .expect("glob regex should be valid");
91    Ok(GlobPattern { glob, regex })
92}
93
94/// Pattern to be tested against string property like commit description or
95/// bookmark name.
96#[derive(Clone, Debug)]
97pub enum StringPattern {
98    /// Matches strings exactly.
99    Exact(String),
100    /// Matches strings case‐insensitively.
101    ExactI(String),
102    /// Matches strings that contain a substring.
103    Substring(String),
104    /// Matches strings that case‐insensitively contain a substring.
105    SubstringI(String),
106    /// Matches with a Unix‐style shell wildcard pattern.
107    Glob(Box<GlobPattern>),
108    /// Matches with a case‐insensitive Unix‐style shell wildcard pattern.
109    GlobI(Box<GlobPattern>),
110    /// Matches substrings with a regular expression.
111    Regex(regex::bytes::Regex),
112    /// Matches substrings with a case‐insensitive regular expression.
113    RegexI(regex::bytes::Regex),
114}
115
116impl StringPattern {
117    /// Pattern that matches any string.
118    pub const fn everything() -> Self {
119        Self::Substring(String::new())
120    }
121
122    /// Parses the given string as a [`StringPattern`]. Everything before the
123    /// first ":" is considered the string's prefix. If the prefix is
124    /// "exact[-i]:", "glob[-i]:", or "substring[-i]:", a pattern of the
125    /// specified kind is returned. Returns an error if the string has an
126    /// unrecognized prefix. Otherwise, a `StringPattern::Exact` is
127    /// returned.
128    pub fn parse(src: &str) -> Result<Self, StringPatternParseError> {
129        if let Some((kind, pat)) = src.split_once(':') {
130            Self::from_str_kind(pat, kind)
131        } else {
132            Ok(Self::exact(src))
133        }
134    }
135
136    /// Constructs a pattern that matches exactly.
137    pub fn exact(src: impl Into<String>) -> Self {
138        Self::Exact(src.into())
139    }
140
141    /// Constructs a pattern that matches case‐insensitively.
142    pub fn exact_i(src: impl Into<String>) -> Self {
143        Self::ExactI(src.into())
144    }
145
146    /// Constructs a pattern that matches a substring.
147    pub fn substring(src: impl Into<String>) -> Self {
148        Self::Substring(src.into())
149    }
150
151    /// Constructs a pattern that case‐insensitively matches a substring.
152    pub fn substring_i(src: impl Into<String>) -> Self {
153        Self::SubstringI(src.into())
154    }
155
156    /// Parses the given string as a glob pattern.
157    pub fn glob(src: &str) -> Result<Self, StringPatternParseError> {
158        // TODO: if no meta character found, it can be mapped to Exact.
159        Ok(Self::Glob(Box::new(parse_glob(src, false)?)))
160    }
161
162    /// Parses the given string as a case‐insensitive glob pattern.
163    pub fn glob_i(src: &str) -> Result<Self, StringPatternParseError> {
164        Ok(Self::GlobI(Box::new(parse_glob(src, true)?)))
165    }
166
167    /// Parses the given string as a regular expression.
168    pub fn regex(src: &str) -> Result<Self, StringPatternParseError> {
169        let pattern = regex::bytes::Regex::new(src).map_err(StringPatternParseError::Regex)?;
170        Ok(Self::Regex(pattern))
171    }
172
173    /// Parses the given string as a case-insensitive regular expression.
174    pub fn regex_i(src: &str) -> Result<Self, StringPatternParseError> {
175        let pattern = regex::bytes::RegexBuilder::new(src)
176            .case_insensitive(true)
177            .build()
178            .map_err(StringPatternParseError::Regex)?;
179        Ok(Self::RegexI(pattern))
180    }
181
182    /// Parses the given string as a pattern of the specified `kind`.
183    pub fn from_str_kind(src: &str, kind: &str) -> Result<Self, StringPatternParseError> {
184        match kind {
185            "exact" => Ok(Self::exact(src)),
186            "exact-i" => Ok(Self::exact_i(src)),
187            "substring" => Ok(Self::substring(src)),
188            "substring-i" => Ok(Self::substring_i(src)),
189            "glob" => Self::glob(src),
190            "glob-i" => Self::glob_i(src),
191            "regex" => Self::regex(src),
192            "regex-i" => Self::regex_i(src),
193            _ => Err(StringPatternParseError::InvalidKind(kind.to_owned())),
194        }
195    }
196
197    /// Returns true if this pattern matches input strings exactly.
198    pub fn is_exact(&self) -> bool {
199        self.as_exact().is_some()
200    }
201
202    /// Returns a literal pattern if this should match input strings exactly.
203    ///
204    /// This can be used to optimize map lookup by exact key.
205    pub fn as_exact(&self) -> Option<&str> {
206        // TODO: Handle trivial case‐insensitive patterns here? It might make people
207        // expect they can use case‐insensitive patterns in contexts where they
208        // generally can’t.
209        match self {
210            Self::Exact(literal) => Some(literal),
211            _ => None,
212        }
213    }
214
215    /// Returns the original string of this pattern.
216    pub fn as_str(&self) -> &str {
217        match self {
218            Self::Exact(literal) => literal,
219            Self::ExactI(literal) => literal,
220            Self::Substring(needle) => needle,
221            Self::SubstringI(needle) => needle,
222            Self::Glob(pattern) => pattern.as_str(),
223            Self::GlobI(pattern) => pattern.as_str(),
224            Self::Regex(pattern) => pattern.as_str(),
225            Self::RegexI(pattern) => pattern.as_str(),
226        }
227    }
228
229    /// Converts this pattern to a glob string. Returns `None` if the pattern
230    /// can't be represented as a glob.
231    pub fn to_glob(&self) -> Option<Cow<'_, str>> {
232        // TODO: Handle trivial case‐insensitive patterns here? It might make people
233        // expect they can use case‐insensitive patterns in contexts where they
234        // generally can’t.
235        match self {
236            Self::Exact(literal) => Some(globset::escape(literal).into()),
237            Self::Substring(needle) => {
238                if needle.is_empty() {
239                    Some("*".into())
240                } else {
241                    Some(format!("*{}*", globset::escape(needle)).into())
242                }
243            }
244            Self::Glob(pattern) => Some(pattern.as_str().into()),
245            Self::ExactI(_) => None,
246            Self::SubstringI(_) => None,
247            Self::GlobI(_) => None,
248            Self::Regex(_) => None,
249            Self::RegexI(_) => None,
250        }
251    }
252
253    /// Returns true if this pattern matches the `haystack` string.
254    ///
255    /// When matching against a case‐insensitive pattern, only ASCII case
256    /// differences are currently folded. This may change in the future.
257    pub fn is_match(&self, haystack: &str) -> bool {
258        self.is_match_bytes(haystack.as_bytes())
259    }
260
261    /// Returns true if this pattern matches the `haystack` bytes.
262    pub fn is_match_bytes(&self, haystack: &[u8]) -> bool {
263        // TODO: Unicode case folding is complicated and can be
264        // locale‐specific. The `globset` crate and Gitoxide only deal with
265        // ASCII case folding, so we do the same here; a more elaborate case
266        // folding system will require making sure those behave in a matching
267        // manner where relevant. That said, regex patterns are unicode-aware by
268        // default, so we already have some inconsistencies.
269        //
270        // Care will need to be taken regarding normalization and the choice of an
271        // appropriate case‐insensitive comparison scheme (`toNFKC_Casefold`?) to ensure
272        // that it is compatible with the standard case‐insensitivity of haystack
273        // components (like internationalized domain names in email addresses). The
274        // availability of normalization and case folding schemes in database backends
275        // will also need to be considered. A locale‐specific case folding
276        // scheme would likely not be appropriate for Jujutsu.
277        //
278        // For some discussion of this topic, see:
279        // <https://github.com/unicode-org/icu4x/issues/3151>
280        match self {
281            Self::Exact(literal) => haystack == literal.as_bytes(),
282            Self::ExactI(literal) => haystack.eq_ignore_ascii_case(literal.as_bytes()),
283            Self::Substring(needle) => haystack.contains_str(needle),
284            Self::SubstringI(needle) => haystack
285                .to_ascii_lowercase()
286                .contains_str(needle.to_ascii_lowercase()),
287            // (Glob, GlobI) and (Regex, RegexI) pairs are identical here, but
288            // callers might want to translate these to backend-specific query
289            // differently.
290            Self::Glob(pattern) => pattern.is_match(haystack),
291            Self::GlobI(pattern) => pattern.is_match(haystack),
292            Self::Regex(pattern) => pattern.is_match(haystack),
293            Self::RegexI(pattern) => pattern.is_match(haystack),
294        }
295    }
296
297    /// Converts the pattern into a bytes regex.
298    pub fn to_regex(&self) -> regex::bytes::Regex {
299        match self {
300            Self::Exact(literal) => {
301                regex::bytes::RegexBuilder::new(&format!("^{}$", regex::escape(literal)))
302                    .build()
303                    .expect("impossible to fail to compile regex of literal")
304            }
305            Self::ExactI(literal) => {
306                regex::bytes::RegexBuilder::new(&format!("^{}$", regex::escape(literal)))
307                    .case_insensitive(true)
308                    .build()
309                    .expect("impossible to fail to compile regex of literal")
310            }
311            Self::Substring(literal) => regex::bytes::RegexBuilder::new(&regex::escape(literal))
312                .build()
313                .expect("impossible to fail to compile regex of literal"),
314            Self::SubstringI(literal) => regex::bytes::RegexBuilder::new(&regex::escape(literal))
315                .case_insensitive(true)
316                .build()
317                .expect("impossible to fail to compile regex of literal"),
318            Self::Glob(glob_pattern) => glob_pattern.to_regex(),
319            // The regex generated represents the case insensitivity itself
320            Self::GlobI(glob_pattern) => glob_pattern.to_regex(),
321            Self::Regex(regex) => regex.clone(),
322            Self::RegexI(regex) => regex.clone(),
323        }
324    }
325
326    /// Iterates entries of the given `map` whose string keys match this
327    /// pattern.
328    pub fn filter_btree_map<'a, 'b, K: Borrow<str> + Ord, V>(
329        &'b self,
330        map: &'a BTreeMap<K, V>,
331    ) -> impl Iterator<Item = (&'a K, &'a V)> + use<'a, 'b, K, V> {
332        self.filter_btree_map_with(map, |key| key, |key| key)
333    }
334
335    /// Iterates entries of the given `map` whose string-like keys match this
336    /// pattern.
337    ///
338    /// The borrowed key type is constrained by the `Deref::Target`. It must be
339    /// convertible to/from `str`.
340    pub fn filter_btree_map_as_deref<'a, 'b, K, V>(
341        &'b self,
342        map: &'a BTreeMap<K, V>,
343    ) -> impl Iterator<Item = (&'a K, &'a V)> + use<'a, 'b, K, V>
344    where
345        K: Borrow<K::Target> + Deref + Ord,
346        K::Target: AsRef<str> + Ord,
347        str: AsRef<K::Target>,
348    {
349        self.filter_btree_map_with(map, AsRef::as_ref, AsRef::as_ref)
350    }
351
352    fn filter_btree_map_with<'a, 'b, K, Q, V, FromKey, ToKey>(
353        &'b self,
354        map: &'a BTreeMap<K, V>,
355        from_key: FromKey,
356        to_key: ToKey,
357        // TODO: Q, FromKey, and ToKey don't have to be captured, but
358        // "currently, all type parameters are required to be mentioned in the
359        // precise captures list" as of rustc 1.85.0.
360    ) -> impl Iterator<Item = (&'a K, &'a V)> + use<'a, 'b, K, Q, V, FromKey, ToKey>
361    where
362        K: Borrow<Q> + Ord,
363        Q: Ord + ?Sized,
364        FromKey: Fn(&Q) -> &str,
365        ToKey: Fn(&str) -> &Q,
366    {
367        if let Some(key) = self.as_exact() {
368            Either::Left(map.get_key_value(to_key(key)).into_iter())
369        } else {
370            Either::Right(
371                map.iter()
372                    .filter(move |&(key, _)| self.is_match(from_key(key.borrow()))),
373            )
374        }
375    }
376}
377
378impl fmt::Display for StringPattern {
379    /// Shows the original string of this pattern.
380    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
381        write!(f, "{}", self.as_str())
382    }
383}
384
385#[cfg(test)]
386mod tests {
387    use assert_matches::assert_matches;
388
389    use super::*;
390
391    #[test]
392    fn test_string_pattern_to_glob() {
393        assert_eq!(StringPattern::everything().to_glob(), Some("*".into()));
394        assert_eq!(StringPattern::exact("a").to_glob(), Some("a".into()));
395        assert_eq!(StringPattern::exact("*").to_glob(), Some("[*]".into()));
396        assert_eq!(
397            StringPattern::glob("*").unwrap().to_glob(),
398            Some("*".into())
399        );
400        assert_eq!(
401            StringPattern::Substring("a".into()).to_glob(),
402            Some("*a*".into())
403        );
404        assert_eq!(
405            StringPattern::Substring("*".into()).to_glob(),
406            Some("*[*]*".into())
407        );
408    }
409
410    #[test]
411    fn test_parse() {
412        // Parse specific pattern kinds.
413        assert_matches!(
414            StringPattern::parse("exact:foo"),
415            Ok(StringPattern::Exact(s)) if s == "foo"
416        );
417        assert_matches!(
418            StringPattern::from_str_kind("foo", "exact"),
419            Ok(StringPattern::Exact(s)) if s == "foo"
420        );
421        assert_matches!(
422            StringPattern::parse("glob:foo*"),
423            Ok(StringPattern::Glob(p)) if p.as_str() == "foo*"
424        );
425        assert_matches!(
426            StringPattern::from_str_kind("foo*", "glob"),
427            Ok(StringPattern::Glob(p)) if p.as_str() == "foo*"
428        );
429        assert_matches!(
430            StringPattern::parse("substring:foo"),
431            Ok(StringPattern::Substring(s)) if s == "foo"
432        );
433        assert_matches!(
434            StringPattern::from_str_kind("foo", "substring"),
435            Ok(StringPattern::Substring(s)) if s == "foo"
436        );
437        assert_matches!(
438            StringPattern::parse("substring-i:foo"),
439            Ok(StringPattern::SubstringI(s)) if s == "foo"
440        );
441        assert_matches!(
442            StringPattern::from_str_kind("foo", "substring-i"),
443            Ok(StringPattern::SubstringI(s)) if s == "foo"
444        );
445        assert_matches!(
446            StringPattern::parse("regex:foo"),
447            Ok(StringPattern::Regex(p)) if p.as_str() == "foo"
448        );
449        assert_matches!(
450            StringPattern::from_str_kind("foo", "regex"),
451            Ok(StringPattern::Regex(p)) if p.as_str() == "foo"
452        );
453        assert_matches!(
454            StringPattern::parse("regex-i:foo"),
455            Ok(StringPattern::RegexI(p)) if p.as_str() == "foo"
456        );
457        assert_matches!(
458            StringPattern::from_str_kind("foo", "regex-i"),
459            Ok(StringPattern::RegexI(p)) if p.as_str() == "foo"
460        );
461
462        // Parse a pattern that contains a : itself.
463        assert_matches!(
464            StringPattern::parse("exact:foo:bar"),
465            Ok(StringPattern::Exact(s)) if s == "foo:bar"
466        );
467
468        // If no kind is specified, the input is treated as an exact pattern.
469        assert_matches!(
470            StringPattern::parse("foo"),
471            Ok(StringPattern::Exact(s)) if s == "foo"
472        );
473
474        // Parsing an unknown prefix results in an error.
475        assert_matches!(
476            StringPattern::parse("unknown-prefix:foo"),
477            Err(StringPatternParseError::InvalidKind(_))
478        );
479    }
480
481    #[test]
482    fn test_glob_is_match() {
483        assert!(StringPattern::glob("foo").unwrap().is_match("foo"));
484        assert!(!StringPattern::glob("foo").unwrap().is_match("foobar"));
485
486        // "." in string isn't any special
487        assert!(StringPattern::glob("*").unwrap().is_match(".foo"));
488
489        // "/" in string isn't any special
490        assert!(StringPattern::glob("*").unwrap().is_match("foo/bar"));
491        assert!(StringPattern::glob(r"*/*").unwrap().is_match("foo/bar"));
492        assert!(!StringPattern::glob(r"*/*").unwrap().is_match(r"foo\bar"));
493
494        // "\" is an escape character
495        assert!(!StringPattern::glob(r"*\*").unwrap().is_match("foo/bar"));
496        assert!(StringPattern::glob(r"*\*").unwrap().is_match("foo*"));
497        assert!(StringPattern::glob(r"\\").unwrap().is_match(r"\"));
498
499        // "*" matches newline
500        assert!(StringPattern::glob(r"*").unwrap().is_match("foo\nbar"));
501
502        assert!(!StringPattern::glob("f?O").unwrap().is_match("Foo"));
503        assert!(StringPattern::glob_i("f?O").unwrap().is_match("Foo"));
504    }
505
506    #[test]
507    fn test_regex_is_match() {
508        // Unicode mode is enabled by default
509        assert!(StringPattern::regex(r"^\w$").unwrap().is_match("\u{c0}"));
510        assert!(StringPattern::regex(r"^.$").unwrap().is_match("\u{c0}"));
511        // ASCII-compatible mode should also work
512        assert!(StringPattern::regex(r"^(?-u)\w$").unwrap().is_match("a"));
513        assert!(
514            !StringPattern::regex(r"^(?-u)\w$")
515                .unwrap()
516                .is_match("\u{c0}")
517        );
518        assert!(
519            StringPattern::regex(r"^(?-u).{2}$")
520                .unwrap()
521                .is_match("\u{c0}")
522        );
523    }
524
525    #[test]
526    fn test_string_pattern_to_regex() {
527        let check = |pattern: StringPattern, match_to: &str| {
528            let regex = pattern.to_regex();
529            regex.is_match(match_to.as_bytes())
530        };
531        assert!(check(StringPattern::exact("$a"), "$a"));
532        assert!(!check(StringPattern::exact("$a"), "$A"));
533        assert!(!check(StringPattern::exact("a"), "aa"));
534        assert!(!check(StringPattern::exact("a"), "aa"));
535        assert!(check(StringPattern::exact_i("a"), "A"));
536        assert!(check(StringPattern::substring("$a"), "$abc"));
537        assert!(!check(StringPattern::substring("$a"), "$Abc"));
538        assert!(check(StringPattern::substring_i("$a"), "$Abc"));
539        assert!(!check(StringPattern::glob("a").unwrap(), "A"));
540        assert!(check(StringPattern::glob_i("a").unwrap(), "A"));
541        assert!(check(StringPattern::regex("^a{1,3}").unwrap(), "abcde"));
542        assert!(!check(StringPattern::regex("^a{1,3}").unwrap(), "Abcde"));
543        assert!(check(StringPattern::regex_i("^a{1,3}").unwrap(), "Abcde"));
544    }
545}