ogc_cql2/
qstring.rs

1// SPDX-License-Identifier: Apache-2.0
2
3#![warn(missing_docs)]
4
//! CQL2 friendly string type that caters for a literal character sequence to
//! be used as-is, or in a case-insensitive and/or accent-insensitive way.
//!
8
9use core::fmt;
10use std::ops;
11use unicase::UniCase;
12use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
13
/// Bit flags telling how a literal string is to be handled: whether to ignore
/// its case, its accents, both, or to use it as is.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct Ignoring(u8);

impl Ignoring {
    /// No bit set: use the string as is.
    const NEITHER: Self = Self(0);
    /// Bit 0: ignore case.
    const CASE: Self = Self(1);
    /// Bit 1: ignore accents.
    const ACCENT: Self = Self(2);
}

impl fmt::Display for Ignoring {
    /// Render the flags as a two character suffix: `/_` (neither), `/c`
    /// (case), `/a` (accent), and `/b` for any other value.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let suffix = match self.0 {
            bits if bits == Self::NEITHER.0 => "/_",
            bits if bits == Self::CASE.0 => "/c",
            bits if bits == Self::ACCENT.0 => "/a",
            _ => "/b",
        };
        f.write_str(suffix)
    }
}

impl ops::BitAnd for Ignoring {
    type Output = Self;

    /// Keep only the bits set in both operands.
    fn bitand(self, rhs: Self) -> Self::Output {
        let (Self(lhs_bits), Self(rhs_bits)) = (self, rhs);
        Self(lhs_bits & rhs_bits)
    }
}

impl ops::BitOr for Ignoring {
    type Output = Self;

    /// Combine the bits set in either operand.
    fn bitor(self, rhs: Self) -> Self::Output {
        let (Self(lhs_bits), Self(rhs_bits)) = (self, rhs);
        Self(lhs_bits | rhs_bits)
    }
}
51
/// String based type used by [Queryables][1] to represent a plain string, and
/// a set of flags to indicate how to use it in case and/or accent insensitive
/// contexts.
///
/// Note that equality is implemented by hand and honours the flags of both
/// operands, whereas the derived ordering compares the `inner` string and
/// then the `flags` field by field.
///
/// [1]: crate::queryable::Q
#[derive(Debug, Clone, PartialOrd, Ord)]
pub struct QString {
    /// String literal.
    inner: String,
    /// Which of case and/or accents to ignore when comparing or matching.
    flags: Ignoring,
}
64
65impl fmt::Display for QString {
66    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
67        write!(f, "/{}{}", self.inner, self.flags)
68    }
69}
70
71impl PartialEq for QString {
72    fn eq(&self, other: &Self) -> bool {
73        let to_icase = self.is_icase() || other.is_icase();
74        let to_iaccent = self.is_iaccent() || other.is_iaccent();
75        match (to_icase, to_iaccent) {
76            (true, true) => {
77                UniCase::new(QString::unaccent(&self.inner))
78                    == UniCase::new(QString::unaccent(&other.inner))
79            }
80            (true, false) => UniCase::new(&self.inner) == UniCase::new(&other.inner),
81            (false, true) => QString::unaccent(&self.inner) == QString::unaccent(&other.inner),
82            (false, false) => self.inner == other.inner,
83        }
84    }
85}
86
87impl Eq for QString {}
88
89impl QString {
90    /// Constructor for a plain instance.
91    pub fn plain<S: Into<String>>(s: S) -> Self {
92        Self {
93            inner: s.into(),
94            flags: Ignoring::NEITHER,
95        }
96    }
97
98    /// Create a new instance from `self` w/ the added ICASE (ignore case) flag
99    /// set.
100    pub fn and_icase(&self) -> Self {
101        Self {
102            inner: self.inner.to_owned(),
103            flags: self.flags.clone() | Ignoring::CASE,
104        }
105    }
106
107    /// Create a new instance from `self` w/ the added IACCENT (ignore accent)
108    /// flag set.
109    pub fn and_iaccent(&self) -> Self {
110        Self {
111            inner: self.inner.to_owned(),
112            flags: self.flags.clone() | Ignoring::ACCENT,
113        }
114    }
115
116    /// Return the inner raw string.
117    pub fn as_str(&self) -> &str {
118        &self.inner
119    }
120
121    /// Return a new string from the given argument w/ all Unicode 'Mn' (Combining
122    /// Mark) codepoints removed.
123    pub fn unaccent(s: &str) -> String {
124        if s.is_empty() {
125            return "".into();
126        }
127
128        let result: String = s.nfd().filter(|x| !is_combining_mark(*x)).nfc().collect();
129        result
130    }
131
132    /// Return a this [`Ignoring`] flags as a byte.
133    pub(crate) fn flags(&self) -> u8 {
134        self.flags.0
135    }
136
137    /// Return a reference to this `inner` string.
138    pub(crate) fn inner(&self) -> &str {
139        &self.inner
140    }
141
142    /// Return TRUE if this is a plain string; FALSE otherwise.
143    #[allow(dead_code)]
144    pub(crate) fn is_plain(&self) -> bool {
145        self.flags.0 == 0
146    }
147
148    /// Return TRUE if this is a case-insensitive string; FALSE otherwise.
149    pub(crate) fn is_icase(&self) -> bool {
150        self.flags.0 % 2 == 1
151    }
152
153    /// Return TRUE if this is an accent-insensitive string; FALSE otherwise.
154    pub(crate) fn is_iaccent(&self) -> bool {
155        self.flags.0 >= 2
156    }
157
158    /// Whether `input` matches the LIKE `pattern`.
159    pub(crate) fn like(input: &Self, pattern: &Self) -> bool {
160        const WC: char = '%';
161
162        // recursively compare 2 sub-strings, 1 char at a time...
163        fn recursive(input: &[char], pattern: &[char]) -> bool {
164            // w/ an empty pattern, only empty input matches...
165            if pattern.is_empty() {
166                return input.is_empty();
167            }
168
169            if input.is_empty() {
170                return pattern.iter().all(|&x| x == '%');
171            }
172
173            if pattern[0] == '\\' && pattern.len() > 1 {
174                let escaped = pattern[1];
175                return recursive(&input[1..], &pattern[2..])
176                    || (input[0] == escaped) && recursive(&input[1..], &pattern[2..]);
177            }
178
179            if pattern[0] == '%' {
180                return recursive(&input[1..], pattern) || recursive(input, &pattern[1..]);
181            }
182
183            if pattern[0] == '_' {
184                return recursive(&input[1..], &pattern[1..]);
185            }
186
187            (input[0] == pattern[0]) && recursive(&input[1..], &pattern[1..])
188        }
189
190        // reduce multiple occurences of unescaped wildcards (uwc) to just one.
191        fn reduce_wildcards(pattern: &str) -> Vec<char> {
192            let mut result: Vec<char> = Vec::with_capacity(pattern.len());
193            let mut chars = pattern.chars();
194            let mut saw_uwc = false;
195            while let Some(c) = chars.next() {
196                let state = if c == '\\' {
197                    result.push('\\');
198                    if let Some(n) = chars.next() {
199                        result.push(n);
200                    }
201                    false
202                } else if c == WC {
203                    if !saw_uwc {
204                        result.push(WC);
205                    }
206                    true
207                } else {
208                    result.push(c);
209                    false
210                };
211                saw_uwc = state;
212            }
213            result
214        }
215
216        // case-insensitive mode kicks in when either arguments is unicase.
217        let input_icase = input.is_icase();
218        let pattern_icase = pattern.is_icase();
219        let icase = input_icase || pattern_icase;
220        // same deal w/ ignore-accents...
221        let input_iaccent = input.is_iaccent();
222        let pattern_iaccent = pattern.is_iaccent();
223        let iaccent = input_iaccent || pattern_iaccent;
224
225        let folded_input: Vec<char> = match (icase, iaccent) {
226            (true, true) => UniCase::unicode(QString::unaccent(&input.inner))
227                .to_folded_case()
228                .chars()
229                .collect(),
230            (true, false) => UniCase::unicode(input.inner.as_str())
231                .to_folded_case()
232                .chars()
233                .collect(),
234            (false, true) => QString::unaccent(&input.inner).as_str().chars().collect(),
235            (false, false) => input.inner.chars().collect(),
236        };
237
238        let binding1 = UniCase::unicode(QString::unaccent(&pattern.inner)).to_folded_case();
239        let binding2 = UniCase::unicode(&pattern.inner).to_folded_case();
240        let binding3 = QString::unaccent(&pattern.inner);
241        let folded_pattern = match (icase, iaccent) {
242            (true, true) => binding1.as_str(),
243            (true, false) => binding2.as_str(),
244            (false, true) => binding3.as_str(),
245            (false, false) => pattern.inner.as_str(),
246        };
247
248        // replace repeated wildcards w/ one. mind escaped instances.
249        let reduced_pattern = reduce_wildcards(folded_pattern);
250
251        recursive(&folded_input, &reduced_pattern)
252    }
253
254    /// Constructor for an accent-insensitive instance.
255    #[cfg(test)]
256    pub fn iaccent(s: &str) -> Self {
257        Self {
258            inner: s.to_owned(),
259            flags: Ignoring::ACCENT,
260        }
261    }
262
263    /// Constructor for a case-insensitive instance.
264    #[cfg(test)]
265    pub fn icase(s: &str) -> Self {
266        Self {
267            inner: s.to_owned(),
268            flags: Ignoring::CASE,
269        }
270    }
271}
272
#[cfg(test)]
mod tests {
    use super::*;
    use rand::{
        Rng,
        distr::{
            Alphanumeric,
            uniform::{UniformChar, UniformSampler},
        },
    };
    use tracing::debug;

    /// Every flag combination renders w/ its own suffix, and the `and_*`
    /// builders leave the instance they were called on untouched.
    #[test]
    fn test_display() {
        let plain = QString::plain("chișinău");
        assert!(plain.is_plain());
        assert_eq!(plain.to_string(), "/chișinău/_");

        let icase = QString::icase("CHIȘINĂU");
        assert!(icase.is_icase());
        assert_eq!(icase.to_string(), "/CHIȘINĂU/c");

        let iaccent = QString::iaccent("CHIȘINĂU");
        assert!(iaccent.is_iaccent());
        assert_eq!(iaccent.to_string(), "/CHIȘINĂU/a");

        let plain_icase = plain.and_icase();
        assert!(plain.is_plain());
        assert!(!plain_icase.is_plain());
        assert!(plain_icase.is_icase());

        let both = plain_icase.and_iaccent();
        assert_eq!(both.to_string(), "/chișinău/b");
        assert!(both.is_icase());
        assert!(both.is_iaccent());
    }

    /// Equality honours the flags of either operand.
    #[test]
    fn test_equality() {
        let lower = QString::plain("chisinau");
        let upper_ic = QString::icase("CHISINAU");
        let accented_ia = QString::iaccent("chișinău");
        let accented_both = QString::iaccent("CHIȘINĂU").and_icase();
        let upper_ia = QString::plain("CHISINAU").and_iaccent();

        assert!(lower == upper_ic);
        assert!(accented_ia == accented_both);
        assert!(accented_both == upper_ia);

        // all together now...
        let upper_both = upper_ic.and_iaccent();
        let accented_ic_ia = accented_ia.and_icase();

        assert!(lower == accented_ia);
        assert!(lower == upper_both);
        assert!(lower == accented_ic_ia);

        // remain valid after same bit set multiple times...
        let upper_both_again = upper_both.and_iaccent();
        assert_eq!(upper_ic, upper_both);
        assert_eq!(upper_ic, upper_both_again);
        assert_eq!(upper_both, upper_both_again);
        assert!(upper_both_again.is_icase());
        assert!(upper_both_again.is_iaccent());
    }

    /// Accents are stripped while everything else, incl. LIKE wildcards, is
    /// left alone.
    #[test]
    fn test_unaccent() {
        let with_a = ["ẵ", "aͣ", "ą", "ǟ", "aₐ", "ắ"];
        let without_a = ["ɑ", "Ⓐ", "ⓐ", "æ", "ǽ", "ⱥ", "ᶏ", "ₐ"];

        assert_eq!(QString::unaccent("chișinău"), "chisinau");
        assert_eq!(QString::unaccent("CHIȘINĂU"), "CHISINAU");

        // now test when LIKE wildcard characters are included...

        let iaccented = QString::unaccent("Chiș%");
        debug!("iaccented = '{iaccented}'");
        assert_eq!(iaccented, "Chis%");

        let iaccented = QString::unaccent("cHis%");
        debug!("iaccented = '{iaccented}'");
        assert_eq!(iaccented, "cHis%");

        // ...and when combined w/ icase, in either order...

        let fold_then_strip = QString::unaccent(&UniCase::new("chișinău%").to_folded_case());
        debug!("a = '{fold_then_strip}'");
        let strip_then_fold = UniCase::new(QString::unaccent("chișinău%")).to_folded_case();
        debug!("b = '{strip_then_fold}'");
        assert_eq!(fold_then_strip, strip_then_fold);

        // test 'a' combos...
        for c in with_a {
            assert!(QString::unaccent(c).starts_with('a'));
        }
        for c in without_a {
            assert!(!QString::unaccent(c).starts_with('a'));
        }
    }

    /// Random alphanumeric word, 5 to 14 characters long, that gets a "Foo"
    /// prefix w/ a probability of 25%.
    fn starts_with_foo() -> String {
        let mut rng = rand::rng();
        let len: usize = rng.random_range(5..15);
        let tail: String = (0..len)
            .map(|_| rng.sample(Alphanumeric) as char)
            .collect();
        if rng.random_bool(0.25) {
            format!("Foo{tail}")
        } else {
            tail
        }
    }

    /// Check that every generated word starting w/ "Foo" is LIKE `pattern`
    /// when taken as a case-insensitive input.
    fn assert_foo_words_match(pattern: &QString) {
        for _ in 0..1000 {
            let s = starts_with_foo();
            if !s.starts_with("Foo") {
                continue;
            }
            let input = QString::icase(&s);
            if !QString::like(&input, pattern) {
                eprintln!("*** Was expecting '{s}' to succeed");
                panic!("Ooops")
            }
        }
    }

    #[test]
    fn test_like_small() {
        assert_foo_words_match(&QString::icase("foo%"));
    }

    #[test]
    fn test_like_capital() {
        assert_foo_words_match(&QString::icase("FOO%"));
    }

    /// Keeping the 1st codepoint of each character's NFKD form yields the same
    /// result as `unaccent` for the sample string.
    #[test]
    fn test_nfkd() {
        const S: &str = "ἄbc";

        let base_chars: String = S.chars().map(|c| c.nfkd().next().unwrap()).collect();
        debug!("'{base_chars}'");
        assert_eq!(base_chars, "αbc");

        assert_eq!(QString::unaccent(S), base_chars);
    }

    #[test]
    #[tracing_test::traced_test]
    fn test_like_bench() {
        // generate random word, 5 to 9 characters long, from the codepoints
        // between U+0041 and U+024F inclusive.
        fn random_latin_word() -> String {
            let mut rng = rand::rng();
            let len: usize = rng.random_range(5..10);
            let dist = UniformChar::new_inclusive('\u{0041}', '\u{024F}').unwrap();
            (0..len).map(|_| dist.sample(&mut rng)).collect()
        }

        const PATTERN: &str = "Ä%%";
        let pattern = QString::plain(PATTERN).and_iaccent().and_icase();
        for _ in 0..1000 {
            let raw = random_latin_word();
            // compute the expectation independently of `QString::like`...
            let cooked: String = raw
                .nfd()
                .filter(|x| !is_combining_mark(*x))
                .nfc()
                .collect();
            let ricotta = UniCase::unicode(&cooked).to_folded_case();
            let expected = ricotta.starts_with('a');

            let input = QString::plain(&raw).and_icase().and_iaccent();
            let actual = QString::like(&input, &pattern);
            if actual == expected {
                continue;
            }

            debug!("    raw: '{raw}' {}", raw.escape_unicode());
            debug!("  cotta: '{cooked}' {}", cooked.escape_unicode());
            debug!("ricotta: '{ricotta}' {}", ricotta.escape_unicode());
            panic!("IA(IC({input})) LIKE IC(IA({pattern})) is {actual} but expected {expected}");
        }
    }
}