xee_interpreter/string/
collation.rs

1use std::cmp::Ordering;
2use std::collections::hash_map::Entry;
3use std::rc::Rc;
4
5use ahash::{HashMap, HashMapExt};
6use icu::collator::{BackwardSecondLevel, CaseLevel, Numeric};
7use icu::{
8    collator::{self, AlternateHandling, CaseFirst, Collator, MaxVariable, Strength},
9    locid::Locale,
10};
11
12use iri_string::types::{IriAbsoluteStr, IriReferenceStr, IriStr, IriString};
13
14use crate::error;
15
16#[derive(Debug, Clone, Eq, PartialEq)]
17pub(crate) struct CollatorQuery {
18    pub(crate) fallback: bool,
19    pub(crate) lang: Option<String>,
20    pub(crate) strength: Strength,
21    pub(crate) max_variable: MaxVariable,
22    pub(crate) alternate: AlternateHandling,
23    pub(crate) backwards: bool,
24    pub(crate) normalization: bool,
25    pub(crate) case_level: bool,
26    pub(crate) case_first: CaseFirst,
27    pub(crate) numeric: bool,
28    // both version and reorder are not supported at this point, as
29    // they don't seem to have equivalents in icu4x
30}
31
32impl From<CollatorQuery> for collator::CollatorOptions {
33    fn from(query: CollatorQuery) -> Self {
34        let mut options = collator::CollatorOptions::new();
35        options.strength = Some(query.strength);
36        options.alternate_handling = Some(query.alternate);
37        options.case_first = Some(query.case_first);
38        options.max_variable = Some(query.max_variable);
39        options.case_level = Some(if query.case_level {
40            CaseLevel::On
41        } else {
42            CaseLevel::Off
43        });
44        options.numeric = Some(if query.numeric {
45            Numeric::On
46        } else {
47            Numeric::Off
48        });
49        options.backward_second_level = Some({
50            if query.backwards {
51                BackwardSecondLevel::On
52            } else {
53                BackwardSecondLevel::Off
54            }
55        });
56        options
57    }
58}
59
60impl CollatorQuery {
61    fn from_url(url: &IriStr) -> error::Result<Self> {
62        let query = url.query_str().unwrap_or("");
63
64        let mut fallback = None;
65        let mut lang = None;
66        let mut strength = None;
67        let mut max_variable = None;
68        let mut alternate = None;
69        let mut backwards = None;
70        let mut normalization = None;
71        let mut case_level = None;
72        let mut case_first = None;
73        let mut numeric = None;
74        let mut has_unrecognized_key = false;
75
76        // last one wins
77        for (key, value) in Self::parse_collation_query(query) {
78            match key {
79                "fallback" => {
80                    fallback = Some(yes_no_query_parameter(value));
81                }
82                "lang" => {
83                    lang = Some(value.to_string());
84                }
85                "strength" => {
86                    strength = Some(strength_query_parameter(value));
87                }
88                "maxVariable" => {
89                    max_variable = Some(max_variable_query_parameter(value));
90                }
91                "alternate" => {
92                    alternate = Some(alternate_query_parameter(value));
93                }
94                "backwards" => {
95                    backwards = Some(yes_no_query_parameter(value));
96                }
97                "normalization" => {
98                    normalization = Some(yes_no_query_parameter(value));
99                }
100                "caseLevel" => {
101                    case_level = Some(yes_no_query_parameter(value));
102                }
103                "caseFirst" => {
104                    case_first = Some(case_first_query_parameter(value));
105                }
106                "numeric" => {
107                    numeric = Some(yes_no_query_parameter(value));
108                }
109                _ => {
110                    has_unrecognized_key = true;
111                }
112            }
113        }
114        let fallback = fallback.unwrap_or(Ok(true)).unwrap_or(true);
115
116        // if depends on fallback whether we accept unrecognized values
117        fn unwrap_or_fail<T>(
118            v: Option<Result<T, Unrecognized>>,
119            default: T,
120            fallback: bool,
121        ) -> error::Result<T> {
122            if let Some(v) = v {
123                if let Ok(v) = v {
124                    Ok(v)
125                } else if fallback {
126                    Ok(default)
127                } else {
128                    Err(error::Error::FOCH0002)
129                }
130            } else {
131                Ok(default)
132            }
133        }
134
135        // if fallback is no we don't recognize any unrecognized keys
136        if !fallback && has_unrecognized_key {
137            return Err(error::Error::FOCH0002);
138        }
139
140        Ok(CollatorQuery {
141            fallback,
142            lang: lang.map(|s| s.to_string()),
143            strength: unwrap_or_fail(strength, Strength::Tertiary, fallback)?,
144            max_variable: unwrap_or_fail(max_variable, MaxVariable::Punctuation, fallback)?,
145            alternate: unwrap_or_fail(alternate, AlternateHandling::NonIgnorable, fallback)?,
146            backwards: unwrap_or_fail(backwards, false, fallback)?,
147            normalization: unwrap_or_fail(normalization, false, fallback)?,
148            case_level: unwrap_or_fail(case_level, false, fallback)?,
149            case_first: unwrap_or_fail(case_first, CaseFirst::Off, fallback)?,
150            numeric: unwrap_or_fail(numeric, false, fallback)?,
151        })
152    }
153
154    fn parse_collation_query(s: &str) -> impl Iterator<Item = (&str, &str)> {
155        // the spec doesn't use normal query parameters separated by & but
156        // semi-colon separated parameters, probably because & is
157        // already used in XML.
158        s.split(';').filter_map(|part| {
159            let mut parts = part.split('=');
160            let key = parts.next()?;
161            let value = parts.next()?;
162            Some((key, value))
163        })
164    }
165}
166
167#[derive(Debug)]
168pub enum Collation {
169    // 5.3.2
170    CodePoint,
171    // 5.3.3
172    Uca(Box<Collator>),
173    // 5.3.4
174    HtmlAscii,
175}
176
177impl Collation {
178    fn new(base_uri: Option<&IriAbsoluteStr>, uri: &IriReferenceStr) -> error::Result<Self> {
179        let uri = if let Some(base_uri) = base_uri {
180            let uri: IriString = uri.resolve_against(base_uri).into();
181            uri
182        } else {
183            let uri: IriString = uri.to_iri().map_err(|_| error::Error::FOCH0002)?.to_owned();
184            uri
185        };
186        if uri.scheme_str() != "http" || uri.authority_str() != Some("www.w3.org") {
187            return Err(error::Error::FOCH0002);
188        }
189        let path = uri.path_str();
190        Ok(match path {
191            "/2005/xpath-functions/collation/codepoint" => Collation::CodePoint,
192            "/2013/collation/UCA" => {
193                let collator_query = CollatorQuery::from_url(&uri)?;
194                Collation::Uca(Box::new(Self::uca_collator(collator_query)?))
195            }
196            "/2005/xpath-functions/collation/html-ascii-case-insensitive" => Collation::HtmlAscii,
197            // TODO: a bit of a hack, we support the qt3 caseblind collation too so that the test suite will work
198            "/2010/09/qt-fots-catalog/collation/caseblind" => Collation::HtmlAscii,
199            _ => return Err(error::Error::FOCH0002),
200        })
201    }
202
203    fn uca_collator(collator_query: CollatorQuery) -> error::Result<Collator> {
204        let locale = if let Some(lang) = &collator_query.lang {
205            match Locale::try_from_bytes(lang.as_bytes()) {
206                Ok(locale) => locale,
207                Err(_) => {
208                    if collator_query.fallback {
209                        // in case of fallback, get a locale anyway
210                        Locale::UND
211                    } else {
212                        return Err(error::Error::FOCH0002);
213                    }
214                }
215            }
216        } else {
217            // this is implementation defined according to the XPath spec
218            // we choose to use the undefined locale
219            Locale::UND
220        };
221
222        let locale = locale.into();
223        let options = collator_query.into();
224
225        Collator::try_new(&locale, options).map_err(|_| error::Error::FOCH0002)
226    }
227
228    pub(crate) fn compare(&self, a: &str, b: &str) -> Ordering {
229        match self {
230            Collation::CodePoint => a.cmp(b),
231            Collation::Uca(collator) => collator.compare(a, b),
232            Collation::HtmlAscii => a.to_ascii_lowercase().cmp(&b.to_ascii_lowercase()),
233        }
234    }
235}
236
237#[derive(Debug)]
238pub(crate) struct Collations {
239    collations: HashMap<String, Rc<Collation>>,
240}
241
242impl Collations {
243    pub(crate) fn new() -> Self {
244        Self {
245            collations: HashMap::new(),
246        }
247    }
248
249    pub(crate) fn load(
250        &mut self,
251        base_uri: Option<&IriAbsoluteStr>,
252        uri: &IriReferenceStr,
253    ) -> error::Result<Rc<Collation>> {
254        // try to find cached collator. we cache by uri
255        match self.collations.entry(uri.to_string()) {
256            Entry::Occupied(entry) => Ok(entry.get().clone()),
257            Entry::Vacant(entry) => {
258                let collation = Collation::new(base_uri, uri)?;
259                Ok(entry.insert(Rc::new(collation)).clone())
260            }
261        }
262    }
263}
264
/// Marker error: a collation query parameter had a value that is not one of
/// the accepted spellings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Unrecognized;
266
267fn yes_no_query_parameter(value: &str) -> Result<bool, Unrecognized> {
268    match value {
269        "yes" => Ok(true),
270        "no" => Ok(false),
271        _ => Err(Unrecognized),
272    }
273}
274
275fn strength_query_parameter(value: &str) -> Result<Strength, Unrecognized> {
276    match value {
277        "primary" | "1" => Ok(Strength::Primary),
278        "secondary" | "2" => Ok(Strength::Secondary),
279        "tertiary" | "3" => Ok(Strength::Tertiary),
280        "quaternary" | "4" => Ok(Strength::Quaternary),
281        "identical" | "5" => Ok(Strength::Identical),
282        _ => Err(Unrecognized),
283    }
284}
285
286fn max_variable_query_parameter(value: &str) -> Result<MaxVariable, Unrecognized> {
287    match value {
288        "space" => Ok(MaxVariable::Space),
289        "punct" => Ok(MaxVariable::Punctuation),
290        "symbol" => Ok(MaxVariable::Symbol),
291        "currency" => Ok(MaxVariable::Currency),
292        _ => Err(Unrecognized),
293    }
294}
295
296fn alternate_query_parameter(value: &str) -> Result<AlternateHandling, Unrecognized> {
297    match value {
298        "non-ignorable" => Ok(AlternateHandling::NonIgnorable),
299        "shifted" => Ok(AlternateHandling::Shifted),
300        // blanked not supported by icu4x
301        _ => Err(Unrecognized),
302    }
303}
304
305fn case_first_query_parameter(value: &str) -> Result<CaseFirst, Unrecognized> {
306    match value {
307        "upper" => Ok(CaseFirst::UpperFirst),
308        "lower" => Ok(CaseFirst::LowerFirst),
309        _ => Err(Unrecognized),
310    }
311}
312
#[cfg(test)]
mod tests {

    use super::*;

    // These first tests pin down the URI resolution behavior of the
    // `iri_string` crate that the collation loading relies on.

    #[test]
    fn test_base_url() {
        let base: &IriAbsoluteStr = "http://www.w3.org/".try_into().unwrap();
        let path: &IriReferenceStr = "/2005/xpath-functions/collation/codepoint"
            .try_into()
            .unwrap();
        let url = path.resolve_against(base);
        assert_eq!(
            url.to_string(),
            "http://www.w3.org/2005/xpath-functions/collation/codepoint"
        );
    }

    #[test]
    fn test_base_url_with_full_url() {
        let base: &IriAbsoluteStr = "http://www.another.org/".try_into().unwrap();
        let path: &IriReferenceStr = "http://www.w3.org/2005/xpath-functions/collation/codepoint"
            .try_into()
            .unwrap();
        let url = path.resolve_against(base);
        assert_eq!(
            url.to_string(),
            "http://www.w3.org/2005/xpath-functions/collation/codepoint"
        );
    }

    #[test]
    fn test_base_url_with_just_qs() {
        let base: &IriAbsoluteStr = "http://www.w3.org/2013/collation/UCA".try_into().unwrap();
        let path: &IriReferenceStr = "?lang=foo".try_into().unwrap();
        let url = path.resolve_against(base);
        assert_eq!(
            url.to_string(),
            "http://www.w3.org/2013/collation/UCA?lang=foo"
        );
    }

    // The remaining tests cover parsing of the collation query string and
    // loading of the supported collations.

    #[test]
    fn test_deserialize_query_string() {
        // Every key here uses the spelling `from_url` recognizes; in
        // particular the key is `maxVariable` and its value `punct`.
        let url: &IriStr = "http://www.w3.org/2013/collation/UCA?fallback=yes;lang=en;\
            strength=primary;maxVariable=punct;alternate=non-ignorable;backwards=no;\
            normalization=no;caseLevel=no;caseFirst=upper;numeric=no"
            .try_into()
            .unwrap();
        let query = CollatorQuery::from_url(url).unwrap();
        assert_eq!(
            query,
            CollatorQuery {
                fallback: true,
                lang: Some("en".to_string()),
                strength: Strength::Primary,
                max_variable: MaxVariable::Punctuation,
                alternate: AlternateHandling::NonIgnorable,
                backwards: false,
                normalization: false,
                case_level: false,
                case_first: CaseFirst::UpperFirst,
                numeric: false,
            }
        );
    }

    #[test]
    fn test_deserialize_query_string_default() {
        let url: &IriStr = "http://www.w3.org/2013/collation/UCA?lang=en"
            .try_into()
            .unwrap();
        let query = CollatorQuery::from_url(url).unwrap();
        assert_eq!(
            query,
            CollatorQuery {
                fallback: true,
                lang: Some("en".to_string()),
                strength: Strength::Tertiary,
                max_variable: MaxVariable::Punctuation,
                alternate: AlternateHandling::NonIgnorable,
                backwards: false,
                normalization: false,
                case_level: false,
                case_first: CaseFirst::Off,
                numeric: false,
            }
        );
    }

    #[test]
    fn test_deserialize_query_no_fallback_reject_wrong_value() {
        let url: &IriStr =
            "http://www.w3.org/2013/collation/UCA?lang=en;fallback=no;strength=nonsense"
                .try_into()
                .unwrap();
        assert!(CollatorQuery::from_url(url).is_err());
    }

    #[test]
    fn test_deserialize_query_no_fallback_reject_extra_param() {
        let url: &IriStr =
            "http://www.w3.org/2013/collation/UCA?lang=en;fallback=no;extra=nonsense"
                .try_into()
                .unwrap();
        assert!(CollatorQuery::from_url(url).is_err());
    }

    #[test]
    fn test_deserialize_query_yes_fallback_default_for_wrong_value() {
        let url: &IriStr =
            "http://www.w3.org/2013/collation/UCA?lang=en;fallback=yes;strength=nonsense"
                .try_into()
                .unwrap();
        let query = CollatorQuery::from_url(url).unwrap();
        assert_eq!(
            query,
            CollatorQuery {
                fallback: true,
                lang: Some("en".to_string()),
                strength: Strength::Tertiary,
                max_variable: MaxVariable::Punctuation,
                alternate: AlternateHandling::NonIgnorable,
                backwards: false,
                normalization: false,
                case_level: false,
                case_first: CaseFirst::Off,
                numeric: false,
            }
        );
    }

    #[test]
    fn test_deserialize_query_yes_fallback_ignore_extra_parameter() {
        let url: IriString =
            "http://www.w3.org/2013/collation/UCA?lang=en;fallback=yes;extra=nonsense"
                .try_into()
                .unwrap();
        let query = CollatorQuery::from_url(&url).unwrap();
        assert_eq!(
            query,
            CollatorQuery {
                fallback: true,
                lang: Some("en".to_string()),
                strength: Strength::Tertiary,
                max_variable: MaxVariable::Punctuation,
                alternate: AlternateHandling::NonIgnorable,
                backwards: false,
                normalization: false,
                case_level: false,
                case_first: CaseFirst::Off,
                numeric: false,
            }
        );
    }

    #[test]
    fn test_load_uca_collation() {
        let mut collations = Collations::new();
        let url: &IriReferenceStr = "http://www.w3.org/2013/collation/UCA?lang=se;fallback=no"
            .try_into()
            .unwrap();
        let collation = collations.load(None, url);
        assert!(collation.is_ok());
    }

    #[test]
    fn test_load_uca_collation_fallback() {
        let mut collations = Collations::new();
        let url: &IriReferenceStr = "http://www.w3.org/2013/collation/UCA?lang=en-US;fallback=yes"
            .try_into()
            .unwrap();
        let collation = collations.load(None, url);
        assert!(collation.is_ok());
    }

    // FIXME: This fallback test is broken since we switched to static instead
    // of blob data. I'm not sure it matters; the conformance tests
    // still work

    // #[test]
    // fn test_load_uca_collation_no_fallback() {
    //     let mut collations = Collations::new();
    //     let collation = collations.load(
    //         None,
    //         "http://www.w3.org/2013/collation/UCA?lang=en-US;fallback=no",
    //     );
    //     assert!(collation.is_err());
    // }

    #[test]
    fn test_load_codepoint_collation() {
        let mut collations = Collations::new();
        let url: &IriReferenceStr = "http://www.w3.org/2005/xpath-functions/collation/codepoint"
            .try_into()
            .unwrap();
        let collation = collations.load(None, url);
        assert!(collation.is_ok());
    }

    #[test]
    fn test_load_html_ascii_collation() {
        let mut collations = Collations::new();
        let url: &IriReferenceStr =
            "http://www.w3.org/2005/xpath-functions/collation/html-ascii-case-insensitive"
                .try_into()
                .unwrap();
        let collation = collations.load(None, url);
        assert!(collation.is_ok());
    }
}
521}