precis_core/
stringclasses.rs

//! This module contains the traits and implementations of the
//! String Classes as defined by the PRECIS framework in
//! [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-4)
4
5use crate::common;
6use crate::context;
7use crate::DerivedPropertyValue;
8use crate::{CodepointInfo, Error, UnexpectedError};
9
10/// Interface for specific classes to deal with specific Unicode
11/// code groups defined in RFC 8264.
12/// Next callbacks will be invoked to calculate the derived property
13/// according to the algorithm defined in [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
14pub trait SpecificDerivedPropertyValue {
15    /// Callback invoked when the Unicode code point belongs to
16    /// [Spaces](https://datatracker.ietf.org/doc/html/rfc8264#section-9.14)
17    fn on_spaces(&self) -> DerivedPropertyValue;
18    /// Callback invoked when the Unicode code point belongs to
19    /// [Symbols](https://datatracker.ietf.org/doc/html/rfc8264#section-9.15)
20    fn on_symbols(&self) -> DerivedPropertyValue;
21    /// Callback invoked when the Unicode code point belongs to
22    /// [Punctuation](https://datatracker.ietf.org/doc/html/rfc8264#section-9.16)
23    fn on_punctuation(&self) -> DerivedPropertyValue;
24    /// Callback invoked when the Unicode code point belongs to
25    /// [`HasCompat`](https://datatracker.ietf.org/doc/html/rfc8264#section-9.17)
26    fn on_has_compat(&self) -> DerivedPropertyValue;
27    /// Callback invoked when the Unicode code point belongs to
28    /// [`OtherLetterDigits`](https://datatracker.ietf.org/doc/html/rfc8264#section-9.18)
29    fn on_other_letter_digits(&self) -> DerivedPropertyValue;
30}
31
32/// Implements the algorithm to calculate the value of the derived property.
33/// This algorithm is as follows (implementations MUST NOT modify the order
34/// of operations within this algorithm, because doing so would cause
35/// inconsistent results across implementations):
36///
37/// > If .`cp`. .in. `Exceptions` Then `Exceptions`(`cp`);\
38/// > Else If .`cp`. .in. `BackwardCompatible` Then `BackwardCompatible`(`cp`);\
39/// > Else If .`cp`. .in. `Unassigned` Then `UNASSIGNED`;\
40/// > Else If .`cp`. .in. `ASCII7` Then `PVALID`;\
41/// > Else If .`cp`. .in. `JoinControl` Then `CONTEXTJ`;\
42/// > Else If .`cp`. .in. `OldHangulJamo` Then `DISALLOWED`;\
43/// > Else If .`cp`. .in. `PrecisIgnorableProperties` Then `DISALLOWED`;\
44/// > Else If .`cp`. .in. `Controls` Then `DISALLOWED`;\
45/// > Else If .`cp`. .in. `HasCompat` Then `ID_DIS` or `FREE_PVAL`;\
46/// > Else If .`cp`. .in. `LetterDigits` Then `PVALID`;\
47/// > Else If .`cp`. .in. `OtherLetterDigits` Then `ID_DIS` or `FREE_PVAL`;\
48/// > Else If .`cp`. .in. `Spaces` Then `ID_DIS` or `FREE_PVAL`;\
49/// > Else If .`cp`. .in. `Symbols` Then `ID_DIS` or `FREE_PVAL`;\
50/// > Else If .`cp`. .in. `Punctuation` Then `ID_DIS` or `FREE_PVAL`;\
51/// > Else `DISALLOWED`;
52///
53/// # Arguments
54/// * `cp` - Unicode code point
55/// * `obj` - Object implementing the [`SpecificDerivedPropertyValue`] trait.
56///
57/// # Return
58/// This function returns the derived property value as defined in
59/// [RFC 8264](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
60#[allow(clippy::if_same_then_else)]
61fn get_derived_property_value(
62    cp: u32,
63    obj: &dyn SpecificDerivedPropertyValue,
64) -> DerivedPropertyValue {
65    match common::get_exception_val(cp) {
66        Some(val) => *val,
67        None => match common::get_backward_compatible_val(cp) {
68            Some(val) => *val,
69            None => {
70                if common::is_unassigned(cp) {
71                    DerivedPropertyValue::Unassigned
72                } else if common::is_ascii7(cp) {
73                    DerivedPropertyValue::PValid
74                } else if common::is_join_control(cp) {
75                    DerivedPropertyValue::ContextJ
76                } else if common::is_old_hangul_jamo(cp) {
77                    DerivedPropertyValue::Disallowed
78                } else if common::is_precis_ignorable_property(cp) {
79                    DerivedPropertyValue::Disallowed
80                } else if common::is_control(cp) {
81                    DerivedPropertyValue::Disallowed
82                } else if common::has_compat(cp) {
83                    obj.on_has_compat()
84                } else if common::is_letter_digit(cp) {
85                    DerivedPropertyValue::PValid
86                } else if common::is_other_letter_digit(cp) {
87                    obj.on_other_letter_digits()
88                } else if common::is_space(cp) {
89                    obj.on_spaces()
90                } else if common::is_symbol(cp) {
91                    obj.on_symbols()
92                } else if common::is_punctuation(cp) {
93                    obj.on_punctuation()
94                } else {
95                    DerivedPropertyValue::Disallowed
96                }
97            }
98        },
99    }
100}
101
102fn allowed_by_context_rule(
103    label: &str,
104    val: DerivedPropertyValue,
105    cp: u32,
106    offset: usize,
107) -> Result<(), Error> {
108    match context::get_context_rule(cp) {
109        None => Err(Error::Unexpected(UnexpectedError::MissingContextRule(
110            CodepointInfo::new(cp, offset, val),
111        ))),
112        Some(rule) => match rule(label, offset) {
113            Ok(allowed) => {
114                if allowed {
115                    Ok(())
116                } else {
117                    Err(Error::BadCodepoint(CodepointInfo::new(cp, offset, val)))
118                }
119            }
120            Err(e) => match e {
121                context::ContextRuleError::NotApplicable => Err(Error::Unexpected(
122                    UnexpectedError::ContextRuleNotApplicable(CodepointInfo::new(cp, offset, val)),
123                )),
124                context::ContextRuleError::Undefined => {
125                    Err(Error::Unexpected(UnexpectedError::Undefined))
126                }
127            },
128        },
129    }
130}
131
132/// Base interface for all String classes in PRECIS framework.
133pub trait StringClass {
134    /// Gets the derived property value according to the algorithm defined
135    /// in [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
136    /// # Arguments
137    /// * `c`- Unicode character
138    /// # Return
139    /// This method returns the derived property value associated to a Unicode character
140    fn get_value_from_char(&self, c: char) -> DerivedPropertyValue;
141
142    /// Gets the derived property value according to the algorithm defined
143    /// in [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
144    /// # Arguments:
145    /// * `cp`- Unicode code point
146    /// # Return
147    /// This method returns the derived property value associated to a Unicode character
148    fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue;
149
150    /// Ensures that the string consists only of Unicode code points that
151    /// are explicitly allowed by the PRECIS
152    /// [String Class](https://datatracker.ietf.org/doc/html/rfc8264#section-4)
153    /// # Arguments:
154    /// * `label` - string to check
155    /// # Returns
156    /// true if all character of `label` are allowed by the String Class.
157    fn allows<S>(&self, label: S) -> Result<(), Error>
158    where
159        S: AsRef<str>,
160    {
161        for (offset, c) in label.as_ref().chars().enumerate() {
162            let val = self.get_value_from_char(c);
163
164            match val {
165                DerivedPropertyValue::PValid | DerivedPropertyValue::SpecClassPval => Ok(()),
166                DerivedPropertyValue::SpecClassDis
167                | DerivedPropertyValue::Disallowed
168                | DerivedPropertyValue::Unassigned => Err(Error::BadCodepoint(CodepointInfo::new(
169                    c as u32, offset, val,
170                ))),
171                DerivedPropertyValue::ContextJ | DerivedPropertyValue::ContextO => {
172                    allowed_by_context_rule(label.as_ref(), val, c as u32, offset)
173                }
174            }?
175        }
176
177        Ok(())
178    }
179}
180
/// PRECIS `IdentifierClass` string class, as specified in
/// [RFC 8264](https://datatracker.ietf.org/doc/html/rfc8264#section-4.2).
///
/// The type carries no state; use `IdentifierClass::default()` to obtain
/// an instance.
/// # Example
/// ```rust
/// # use precis_core::{DerivedPropertyValue,IdentifierClass,StringClass};
/// let id = IdentifierClass::default();
/// // character 𐍁 is OtherLetterDigits (R)
/// assert_eq!(id.get_value_from_char('𐍁'), DerivedPropertyValue::SpecClassDis);
/// // Character S is ASCII7 (K)
/// assert_eq!(id.get_value_from_char('S'), DerivedPropertyValue::PValid);
/// // Character 0x1170 is OldHangulJamo (I)
/// assert_eq!(id.get_value_from_codepoint(0x1170), DerivedPropertyValue::Disallowed);
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct IdentifierClass {}
196
197impl SpecificDerivedPropertyValue for IdentifierClass {
198    // `ID_DIS` mapped to `SPEC_CLASS_DIS`
199    fn on_has_compat(&self) -> DerivedPropertyValue {
200        DerivedPropertyValue::SpecClassDis
201    }
202    fn on_other_letter_digits(&self) -> DerivedPropertyValue {
203        DerivedPropertyValue::SpecClassDis
204    }
205    fn on_spaces(&self) -> DerivedPropertyValue {
206        DerivedPropertyValue::SpecClassDis
207    }
208    fn on_symbols(&self) -> DerivedPropertyValue {
209        DerivedPropertyValue::SpecClassDis
210    }
211    fn on_punctuation(&self) -> DerivedPropertyValue {
212        DerivedPropertyValue::SpecClassDis
213    }
214}
215
216impl StringClass for IdentifierClass {
217    fn get_value_from_char(&self, c: char) -> DerivedPropertyValue {
218        get_derived_property_value(c as u32, self)
219    }
220
221    fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue {
222        get_derived_property_value(cp, self)
223    }
224}
225
/// PRECIS `FreeformClass` string class, as specified in
/// [RFC 8264](https://datatracker.ietf.org/doc/html/rfc8264#section-4.3).
///
/// The type carries no state; use `FreeformClass::default()` to obtain
/// an instance.
/// # Example
/// ```rust
/// # use precis_core::{DerivedPropertyValue,FreeformClass,StringClass};
/// let ff = FreeformClass::default();
/// // character 𐍁 is OtherLetterDigits (R)
/// assert_eq!(ff.get_value_from_char('𐍁'), DerivedPropertyValue::SpecClassPval);
/// // Character S is ASCII7 (K)
/// assert_eq!(ff.get_value_from_char('S'), DerivedPropertyValue::PValid);
/// // Character 0x1170 is OldHangulJamo (I)
/// assert_eq!(ff.get_value_from_codepoint(0x1170), DerivedPropertyValue::Disallowed);
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct FreeformClass {}
241
242impl SpecificDerivedPropertyValue for FreeformClass {
243    fn on_has_compat(&self) -> DerivedPropertyValue {
244        DerivedPropertyValue::SpecClassPval
245    }
246    fn on_other_letter_digits(&self) -> DerivedPropertyValue {
247        DerivedPropertyValue::SpecClassPval
248    }
249    fn on_spaces(&self) -> DerivedPropertyValue {
250        DerivedPropertyValue::SpecClassPval
251    }
252    fn on_symbols(&self) -> DerivedPropertyValue {
253        DerivedPropertyValue::SpecClassPval
254    }
255    fn on_punctuation(&self) -> DerivedPropertyValue {
256        DerivedPropertyValue::SpecClassPval
257    }
258}
259
260impl StringClass for FreeformClass {
261    fn get_value_from_char(&self, c: char) -> DerivedPropertyValue {
262        get_derived_property_value(c as u32, self)
263    }
264
265    fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue {
266        get_derived_property_value(cp, self)
267    }
268}
269
#[cfg(test)]
mod test_string_classes {
    use super::*;

    /// Stub string class with a hard-coded code point table, so that every
    /// branch of `StringClass::allows` can be reached independently of the
    /// real derived property algorithm.
    pub struct TestClass {}

    impl StringClass for TestClass {
        fn get_value_from_char(&self, c: char) -> DerivedPropertyValue {
            self.get_value_from_codepoint(c as u32)
        }

        fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue {
            match cp {
                // 'a', 'l', Virama
                0x0061 | 0x006c | 0x094d => DerivedPropertyValue::PValid,
                // 'b'
                0x0062 => DerivedPropertyValue::SpecClassPval,
                // 'c'
                0x0063 => DerivedPropertyValue::SpecClassDis,
                // 'd', ZERO WIDTH JOINER
                0x0064 | 0x200d => DerivedPropertyValue::ContextJ,
                // 'e', MIDDLE DOT
                0x0065 | 0x00b7 => DerivedPropertyValue::ContextO,
                // 'f'
                0x0066 => DerivedPropertyValue::Disallowed,
                _ => DerivedPropertyValue::Unassigned,
            }
        }
    }

    /// Builds the `CodepointInfo` expected inside an error.
    fn info(cp: u32, position: usize, property: DerivedPropertyValue) -> CodepointInfo {
        CodepointInfo {
            cp,
            position,
            property,
        }
    }

    /// Expected result when a code point is rejected.
    fn bad_codepoint(
        cp: u32,
        position: usize,
        property: DerivedPropertyValue,
    ) -> Result<(), Error> {
        Err(Error::BadCodepoint(info(cp, position, property)))
    }

    /// Expected result when a contextual code point has no context rule.
    fn missing_rule(
        cp: u32,
        position: usize,
        property: DerivedPropertyValue,
    ) -> Result<(), Error> {
        Err(Error::Unexpected(UnexpectedError::MissingContextRule(
            info(cp, position, property),
        )))
    }

    /// Expected result when a context rule does not apply to the code point.
    fn rule_not_applicable(
        cp: u32,
        position: usize,
        property: DerivedPropertyValue,
    ) -> Result<(), Error> {
        Err(Error::Unexpected(
            UnexpectedError::ContextRuleNotApplicable(info(cp, position, property)),
        ))
    }

    /// Expected result when a context rule reports an undefined outcome.
    fn undefined() -> Result<(), Error> {
        Err(Error::Unexpected(UnexpectedError::Undefined))
    }

    #[test]
    fn test_allows_code_point() {
        let stub = TestClass {};

        // PValid and SpecClassPval are accepted.
        assert_eq!(stub.allows("\u{61}"), Ok(()));
        assert_eq!(stub.allows("\u{62}"), Ok(()));

        // SpecClassDis, Disallowed and Unassigned are rejected.
        assert_eq!(
            stub.allows("\u{63}"),
            bad_codepoint(0x63, 0, DerivedPropertyValue::SpecClassDis)
        );
        assert_eq!(
            stub.allows("\u{0066}"),
            bad_codepoint(0x66, 0, DerivedPropertyValue::Disallowed)
        );
        assert_eq!(
            stub.allows("\u{67}"),
            bad_codepoint(0x67, 0, DerivedPropertyValue::Unassigned)
        );

        // ContextJ: code point without a context rule.
        assert_eq!(
            stub.allows("\u{64}"),
            missing_rule(0x64, 0, DerivedPropertyValue::ContextJ)
        );

        // ContextJ: the rule exists and rejects the code point.
        assert_eq!(
            stub.allows("a\u{200d}"),
            bad_codepoint(0x200d, 1, DerivedPropertyValue::ContextJ)
        );

        // ContextJ: the rule exists but yields an unexpected error.
        assert_eq!(stub.allows("\u{200d}"), undefined());

        // ContextJ: the rule exists and accepts the code point.
        assert_eq!(stub.allows("\u{94d}\u{200d}"), Ok(()));

        // ContextO: code point without a context rule.
        assert_eq!(
            stub.allows("\u{65}"),
            missing_rule(0x65, 0, DerivedPropertyValue::ContextO)
        );

        // ContextO: the rule exists and rejects the code point.
        assert_eq!(
            stub.allows("a\u{00b7}b"),
            bad_codepoint(0x00b7, 1, DerivedPropertyValue::ContextO)
        );

        // ContextO: the rule exists but yields an unexpected error.
        assert_eq!(stub.allows("\u{00b7}"), undefined());

        // ContextO: the rule exists and accepts the code point.
        assert_eq!(stub.allows("\u{006c}\u{00b7}\u{006c}"), Ok(()));
    }

    #[test]
    fn test_allowed_by_context_rule() {
        let ctx_o = DerivedPropertyValue::ContextO;

        // No context rule for the code point.
        assert_eq!(
            allowed_by_context_rule("test", ctx_o, 0xffff, 0),
            missing_rule(0xffff, 0, DerivedPropertyValue::ContextO)
        );

        // Middle dot rule: allowed.
        assert_eq!(
            allowed_by_context_rule("\u{006c}\u{00b7}\u{006c}", ctx_o, 0x00b7, 1),
            Ok(())
        );

        // Middle dot rule: disallowed.
        assert_eq!(
            allowed_by_context_rule("\u{006c}\u{00b7}a", ctx_o, 0x00b7, 1),
            bad_codepoint(0x00b7, 1, DerivedPropertyValue::ContextO)
        );

        // Middle dot rule: unexpected error.
        assert_eq!(
            allowed_by_context_rule("\u{00b7}", ctx_o, 0x00b7, 0),
            undefined()
        );

        // Middle dot rule: not applicable to the code point at that offset.
        assert_eq!(
            allowed_by_context_rule("\u{0066}", ctx_o, 0x00b7, 0),
            rule_not_applicable(0x00b7, 0, DerivedPropertyValue::ContextO)
        );
    }
}