Skip to main content

ogc_cql2/
qstring.rs

1// SPDX-License-Identifier: Apache-2.0
2
3#![warn(missing_docs)]
4
//! CQL2 friendly string type that caters for a literal character sequence to
//! be used as-is, or in a case-insensitive and/or accent-insensitive way.
//!
8
9use core::fmt;
10use std::ops;
11use unicase::UniCase;
12use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
13
/// Bit flags stating how a literal string is to be handled: bit 0 asks for
/// its case to be ignored, bit 1 for its accents to be ignored. Both bits may
/// be set at once; when none is set the string is used as is.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub(crate) struct Ignoring(u8);

impl Ignoring {
    /// No flag set: use the string as is.
    const NEITHER: Self = Self(0b00);
    /// Ignore letter case.
    const CASE: Self = Self(0b01);
    /// Ignore accents.
    const ACCENT: Self = Self(0b10);
}
24
25impl fmt::Display for Ignoring {
26    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
27        match *self {
28            Ignoring::NEITHER => write!(f, "/_"),
29            Ignoring::CASE => write!(f, "/c"),
30            Ignoring::ACCENT => write!(f, "/a"),
31            _ => write!(f, "/b"),
32        }
33    }
34}
35
36impl ops::BitAnd for Ignoring {
37    type Output = Self;
38
39    fn bitand(self, rhs: Self) -> Self::Output {
40        Self(self.0 & rhs.0)
41    }
42}
43
44impl ops::BitOr for Ignoring {
45    type Output = Self;
46
47    fn bitor(self, rhs: Self) -> Self::Output {
48        Self(self.0 | rhs.0)
49    }
50}
51
52/// String based type used by [`Queryable`s][crate::Q] to represent a plain string, and
53/// a set of flags to indicate how to use it in case and/or accent insensitive
54/// contexts.
55#[derive(Debug, Clone, PartialOrd, Ord)]
56pub struct QString {
57    /// String literal.
58    inner: String,
59    /// How to use in case and accent sensitive contexts.
60    flags: Ignoring,
61}
62
63impl fmt::Display for QString {
64    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
65        write!(f, "/{}{}", self.inner, self.flags)
66    }
67}
68
69impl PartialEq for QString {
70    fn eq(&self, other: &Self) -> bool {
71        let to_icase = self.is_icase() || other.is_icase();
72        let to_iaccent = self.is_iaccent() || other.is_iaccent();
73        match (to_icase, to_iaccent) {
74            (true, true) => {
75                UniCase::new(QString::unaccent(&self.inner))
76                    == UniCase::new(QString::unaccent(&other.inner))
77            }
78            (true, false) => UniCase::new(&self.inner) == UniCase::new(&other.inner),
79            (false, true) => QString::unaccent(&self.inner) == QString::unaccent(&other.inner),
80            (false, false) => self.inner == other.inner,
81        }
82    }
83}
84
85impl Eq for QString {}
86
87impl QString {
88    /// Constructor for a plain instance.
89    pub fn plain<S: Into<String>>(s: S) -> Self {
90        Self {
91            inner: s.into(),
92            flags: Ignoring::NEITHER,
93        }
94    }
95
96    /// Create a new instance from `self` w/ the added ICASE (ignore case) flag
97    /// set.
98    pub fn and_icase(&self) -> Self {
99        Self {
100            inner: self.inner.to_owned(),
101            flags: self.flags.clone() | Ignoring::CASE,
102        }
103    }
104
105    /// Create a new instance from `self` w/ the added IACCENT (ignore accent)
106    /// flag set.
107    pub fn and_iaccent(&self) -> Self {
108        Self {
109            inner: self.inner.to_owned(),
110            flags: self.flags.clone() | Ignoring::ACCENT,
111        }
112    }
113
114    /// Return the inner raw string.
115    pub fn as_str(&self) -> &str {
116        &self.inner
117    }
118
119    /// Return a new string from the given argument w/ all Unicode 'Mn' (Combining
120    /// Mark) codepoints removed.
121    pub fn unaccent(s: &str) -> String {
122        if s.is_empty() {
123            return "".into();
124        }
125
126        let result: String = s.nfd().filter(|x| !is_combining_mark(*x)).nfc().collect();
127        result
128    }
129
130    /// Return a this [`Ignoring`] flags as a byte.
131    pub(crate) fn flags(&self) -> u8 {
132        self.flags.0
133    }
134
135    /// Return a reference to this `inner` string.
136    pub(crate) fn inner(&self) -> &str {
137        &self.inner
138    }
139
140    /// Return TRUE if this is a plain string; FALSE otherwise.
141    #[allow(dead_code)]
142    pub(crate) fn is_plain(&self) -> bool {
143        self.flags.0 == 0
144    }
145
146    /// Return TRUE if this is a case-insensitive string; FALSE otherwise.
147    pub(crate) fn is_icase(&self) -> bool {
148        self.flags.0 % 2 == 1
149    }
150
151    /// Return TRUE if this is an accent-insensitive string; FALSE otherwise.
152    pub(crate) fn is_iaccent(&self) -> bool {
153        self.flags.0 >= 2
154    }
155
156    /// Whether `input` matches the LIKE `pattern`.
157    pub(crate) fn like(input: &Self, pattern: &Self) -> bool {
158        const WC: char = '%';
159
160        // recursively compare 2 sub-strings, 1 char at a time...
161        fn recursive(input: &[char], pattern: &[char]) -> bool {
162            // w/ an empty pattern, only empty input matches...
163            if pattern.is_empty() {
164                return input.is_empty();
165            }
166
167            if input.is_empty() {
168                return pattern.iter().all(|&x| x == '%');
169            }
170
171            if pattern[0] == '\\' && pattern.len() > 1 {
172                let escaped = pattern[1];
173                return recursive(&input[1..], &pattern[2..])
174                    || (input[0] == escaped) && recursive(&input[1..], &pattern[2..]);
175            }
176
177            if pattern[0] == '%' {
178                return recursive(&input[1..], pattern) || recursive(input, &pattern[1..]);
179            }
180
181            if pattern[0] == '_' {
182                return recursive(&input[1..], &pattern[1..]);
183            }
184
185            (input[0] == pattern[0]) && recursive(&input[1..], &pattern[1..])
186        }
187
188        // reduce multiple occurences of unescaped wildcards (uwc) to just one.
189        fn reduce_wildcards(pattern: &str) -> Vec<char> {
190            let mut result: Vec<char> = Vec::with_capacity(pattern.len());
191            let mut chars = pattern.chars();
192            let mut saw_uwc = false;
193            while let Some(c) = chars.next() {
194                let state = if c == '\\' {
195                    result.push('\\');
196                    if let Some(n) = chars.next() {
197                        result.push(n);
198                    }
199                    false
200                } else if c == WC {
201                    if !saw_uwc {
202                        result.push(WC);
203                    }
204                    true
205                } else {
206                    result.push(c);
207                    false
208                };
209                saw_uwc = state;
210            }
211            result
212        }
213
214        // case-insensitive mode kicks in when either arguments is unicase.
215        let input_icase = input.is_icase();
216        let pattern_icase = pattern.is_icase();
217        let icase = input_icase || pattern_icase;
218        // same deal w/ ignore-accents...
219        let input_iaccent = input.is_iaccent();
220        let pattern_iaccent = pattern.is_iaccent();
221        let iaccent = input_iaccent || pattern_iaccent;
222
223        let folded_input: Vec<char> = match (icase, iaccent) {
224            (true, true) => UniCase::unicode(QString::unaccent(&input.inner))
225                .to_folded_case()
226                .chars()
227                .collect(),
228            (true, false) => UniCase::unicode(input.inner.as_str())
229                .to_folded_case()
230                .chars()
231                .collect(),
232            (false, true) => QString::unaccent(&input.inner).as_str().chars().collect(),
233            (false, false) => input.inner.chars().collect(),
234        };
235
236        let binding1 = UniCase::unicode(QString::unaccent(&pattern.inner)).to_folded_case();
237        let binding2 = UniCase::unicode(&pattern.inner).to_folded_case();
238        let binding3 = QString::unaccent(&pattern.inner);
239        let folded_pattern = match (icase, iaccent) {
240            (true, true) => binding1.as_str(),
241            (true, false) => binding2.as_str(),
242            (false, true) => binding3.as_str(),
243            (false, false) => pattern.inner.as_str(),
244        };
245
246        // replace repeated wildcards w/ one. mind escaped instances.
247        let reduced_pattern = reduce_wildcards(folded_pattern);
248
249        recursive(&folded_input, &reduced_pattern)
250    }
251
252    /// Constructor for an accent-insensitive instance.
253    #[cfg(test)]
254    pub fn iaccent(s: &str) -> Self {
255        Self {
256            inner: s.to_owned(),
257            flags: Ignoring::ACCENT,
258        }
259    }
260
261    /// Constructor for a case-insensitive instance.
262    #[cfg(test)]
263    pub fn icase(s: &str) -> Self {
264        Self {
265            inner: s.to_owned(),
266            flags: Ignoring::CASE,
267        }
268    }
269}
270
#[cfg(test)]
mod tests {
    use super::*;
    use rand::{
        Rng,
        distr::{
            Alphanumeric,
            uniform::{UniformChar, UniformSampler},
        },
    };
    use tracing::debug;

    /// Rendering and flag predicates of every flavour of instance.
    #[test]
    fn test_display() {
        let plain = QString::plain("chișinău");
        assert!(plain.is_plain());
        assert_eq!(plain.to_string(), "/chișinău/_");

        let icase = QString::icase("CHIȘINĂU");
        assert!(icase.is_icase());
        assert_eq!(icase.to_string(), "/CHIȘINĂU/c");

        let iaccent = QString::iaccent("CHIȘINĂU");
        assert!(iaccent.is_iaccent());
        assert_eq!(iaccent.to_string(), "/CHIȘINĂU/a");

        // adding a flag yields a new instance; the source stays untouched...
        let plain_icase = plain.and_icase();
        assert!(plain.is_plain());
        assert!(!plain_icase.is_plain());
        assert!(plain_icase.is_icase());

        let both = plain_icase.and_iaccent();
        assert_eq!(both.to_string(), "/chișinău/b");
        assert!(both.is_icase());
        assert!(both.is_iaccent());
    }

    /// Equality honours the flags carried by either operand.
    #[test]
    fn test_equality() {
        let plain_lo = QString::plain("chisinau");
        let icase_hi = QString::icase("CHISINAU");
        let iaccent_lo = QString::iaccent("chișinău");
        let both_hi = QString::iaccent("CHIȘINĂU").and_icase();
        let iaccent_hi = QString::plain("CHISINAU").and_iaccent();

        assert!(plain_lo == icase_hi);
        assert!(iaccent_lo == both_hi);
        assert!(both_hi == iaccent_hi);

        // all together now...
        let icase_then_iaccent = icase_hi.and_iaccent();
        let iaccent_then_icase = iaccent_lo.and_icase();

        assert!(plain_lo == iaccent_lo);
        assert!(plain_lo == icase_then_iaccent);
        assert!(plain_lo == iaccent_then_icase);

        // remain valid after same bit set multiple times...
        let set_twice = icase_then_iaccent.and_iaccent();
        assert_eq!(icase_hi, icase_then_iaccent);
        assert_eq!(icase_hi, set_twice);
        assert_eq!(icase_then_iaccent, set_twice);
        assert!(set_twice.is_icase());
        assert!(set_twice.is_iaccent());
    }

    /// Accent removal on plain words, on LIKE patterns and combined w/ case
    /// folding.
    #[test]
    fn test_unaccent() {
        for (accented, expected) in [("chișinău", "chisinau"), ("CHIȘINĂU", "CHISINAU")] {
            assert_eq!(QString::unaccent(accented), expected);
        }

        // now test when LIKE wildcard characters are included...
        for (pattern, expected) in [("Chiș%", "Chis%"), ("cHis%", "cHis%")] {
            let iaccented = QString::unaccent(pattern);
            debug!("iaccented = '{iaccented}'");
            assert_eq!(iaccented, expected);
        }

        // ...and when combined w/ icase; the order of the 2 steps is moot...
        let a = QString::unaccent(&UniCase::new("chișinău%").to_folded_case());
        debug!("a = '{a}'");
        let b = UniCase::new(QString::unaccent("chișinău%")).to_folded_case();
        debug!("b = '{b}'");
        assert_eq!(a, b);

        // test 'a' combos; first those expected to reduce to a leading 'a'...
        for c in ["ẵ", "aͣ", "ą", "ǟ", "aₐ", "ắ"] {
            assert!(QString::unaccent(c).starts_with('a'));
        }
        // ...then look-alikes which must not.
        for c in ["ɑ", "Ⓐ", "ⓐ", "æ", "ǽ", "ⱥ", "ᶏ", "ₐ"] {
            assert!(!QString::unaccent(c).starts_with('a'));
        }
    }

    /// Random alphanumeric word, 5 to 14 characters long, which gets a "Foo"
    /// prefix about 1 time out of 4.
    fn starts_with_foo() -> String {
        let mut rng = rand::rng();
        let size: usize = rng.random_range(5..15);
        let mut s = String::with_capacity(size);
        for _ in 0..size {
            s.push(rng.sample(Alphanumeric) as char);
        }
        if rng.random_bool(0.25) {
            format!("Foo{s}")
        } else {
            s
        }
    }

    /// Panic unless every random word starting w/ "Foo" is LIKE `pattern` when
    /// used as a case-insensitive input.
    fn expect_foo_words_to_match(pattern: &QString) {
        for _ in 0..1000 {
            let s = starts_with_foo();
            if !s.starts_with("Foo") {
                continue;
            }
            let input = QString::icase(&s);
            if !QString::like(&input, pattern) {
                panic!("Ooops! Was expecting '{s}' to succeed")
            }
        }
    }

    #[test]
    fn test_like_small() {
        expect_foo_words_to_match(&QString::icase("foo%"));
    }

    #[test]
    fn test_like_capital() {
        expect_foo_words_to_match(&QString::icase("FOO%"));
    }

    /// Keeping the 1st codepoint of each character's NFKD decomposition gives
    /// the same result as `unaccent` for this sample.
    #[test]
    fn test_nfkd() {
        const S: &str = "ἄbc";

        let mut r1 = String::new();
        for c in S.chars() {
            r1.push(UnicodeNormalization::nfkd(c).next().unwrap());
        }
        debug!("'{r1}'");
        assert_eq!(r1, "αbc");

        assert_eq!(QString::unaccent(S), r1);
    }

    /// Compare LIKE against an independently computed expectation on random
    /// words.
    #[test]
    fn test_like_bench() {
        // generate random word, 5 to 9 characters long, from codepoints in
        // the U+0041 to U+024F range.
        fn random_latin_word() -> String {
            let mut rng = rand::rng();
            let len: usize = rng.random_range(5..10);
            let dist = UniformChar::new_inclusive('\u{0041}', '\u{024F}').unwrap();
            let mut word = String::new();
            for _ in 0..len {
                word.push(dist.sample(&mut rng));
            }
            word
        }

        const PATTERN: &str = "Ä%%";
        let pattern = QString::plain(PATTERN).and_iaccent().and_icase();
        for _ in 0..1000 {
            let raw = random_latin_word();
            // strip accents, then fold case, w/o going through `QString`...
            let cooked: String = raw.nfd().filter(|x| !is_combining_mark(*x)).nfc().collect();
            let ricotta = UniCase::unicode(&cooked).to_folded_case();
            let expected = ricotta.starts_with('a');

            let input = QString::plain(&raw).and_icase().and_iaccent();
            let actual = QString::like(&input, &pattern);
            if actual == expected {
                continue;
            }

            debug!("    raw: '{raw}' {}", raw.escape_unicode());
            debug!("  cotta: '{cooked}' {}", cooked.escape_unicode());
            debug!("ricotta: '{ricotta}' {}", ricotta.escape_unicode());
            panic!("IA(IC({input})) LIKE IC(IA({pattern})) is {actual} but expected {expected}");
        }
    }
}