precis_core/stringclasses.rs
//! This module contains the traits and implementations of the
//! String Classes defined by the PRECIS framework
3//! [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-4)
4
5use crate::common;
6use crate::context;
7use crate::DerivedPropertyValue;
8use crate::{CodepointInfo, Error, UnexpectedError};
9
10/// Interface for specific classes to deal with specific Unicode
11/// code groups defined in RFC 8264.
12/// Next callbacks will be invoked to calculate the derived property
13/// according to the algorithm defined in [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
14pub trait SpecificDerivedPropertyValue {
15 /// Callback invoked when the Unicode code point belongs to
16 /// [Spaces](https://datatracker.ietf.org/doc/html/rfc8264#section-9.14)
17 fn on_spaces(&self) -> DerivedPropertyValue;
18 /// Callback invoked when the Unicode code point belongs to
19 /// [Symbols](https://datatracker.ietf.org/doc/html/rfc8264#section-9.15)
20 fn on_symbols(&self) -> DerivedPropertyValue;
21 /// Callback invoked when the Unicode code point belongs to
22 /// [Punctuation](https://datatracker.ietf.org/doc/html/rfc8264#section-9.16)
23 fn on_punctuation(&self) -> DerivedPropertyValue;
24 /// Callback invoked when the Unicode code point belongs to
25 /// [`HasCompat`](https://datatracker.ietf.org/doc/html/rfc8264#section-9.17)
26 fn on_has_compat(&self) -> DerivedPropertyValue;
27 /// Callback invoked when the Unicode code point belongs to
28 /// [`OtherLetterDigits`](https://datatracker.ietf.org/doc/html/rfc8264#section-9.18)
29 fn on_other_letter_digits(&self) -> DerivedPropertyValue;
30}
31
32/// Implements the algorithm to calculate the value of the derived property.
33/// This algorithm is as follows (implementations MUST NOT modify the order
34/// of operations within this algorithm, because doing so would cause
35/// inconsistent results across implementations):
36///
37/// > If .`cp`. .in. `Exceptions` Then `Exceptions`(`cp`);\
38/// > Else If .`cp`. .in. `BackwardCompatible` Then `BackwardCompatible`(`cp`);\
39/// > Else If .`cp`. .in. `Unassigned` Then `UNASSIGNED`;\
40/// > Else If .`cp`. .in. `ASCII7` Then `PVALID`;\
41/// > Else If .`cp`. .in. `JoinControl` Then `CONTEXTJ`;\
42/// > Else If .`cp`. .in. `OldHangulJamo` Then `DISALLOWED`;\
43/// > Else If .`cp`. .in. `PrecisIgnorableProperties` Then `DISALLOWED`;\
44/// > Else If .`cp`. .in. `Controls` Then `DISALLOWED`;\
45/// > Else If .`cp`. .in. `HasCompat` Then `ID_DIS` or `FREE_PVAL`;\
46/// > Else If .`cp`. .in. `LetterDigits` Then `PVALID`;\
47/// > Else If .`cp`. .in. `OtherLetterDigits` Then `ID_DIS` or `FREE_PVAL`;\
48/// > Else If .`cp`. .in. `Spaces` Then `ID_DIS` or `FREE_PVAL`;\
49/// > Else If .`cp`. .in. `Symbols` Then `ID_DIS` or `FREE_PVAL`;\
50/// > Else If .`cp`. .in. `Punctuation` Then `ID_DIS` or `FREE_PVAL`;\
51/// > Else `DISALLOWED`;
52///
53/// # Arguments
54/// * `cp` - Unicode code point
55/// * `obj` - Object implementing the [`SpecificDerivedPropertyValue`] trait.
56///
57/// # Return
58/// This function returns the derived property value as defined in
59/// [RFC 8264](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
60#[allow(clippy::if_same_then_else)]
61fn get_derived_property_value(
62 cp: u32,
63 obj: &dyn SpecificDerivedPropertyValue,
64) -> DerivedPropertyValue {
65 match common::get_exception_val(cp) {
66 Some(val) => *val,
67 None => match common::get_backward_compatible_val(cp) {
68 Some(val) => *val,
69 None => {
70 if common::is_unassigned(cp) {
71 DerivedPropertyValue::Unassigned
72 } else if common::is_ascii7(cp) {
73 DerivedPropertyValue::PValid
74 } else if common::is_join_control(cp) {
75 DerivedPropertyValue::ContextJ
76 } else if common::is_old_hangul_jamo(cp) {
77 DerivedPropertyValue::Disallowed
78 } else if common::is_precis_ignorable_property(cp) {
79 DerivedPropertyValue::Disallowed
80 } else if common::is_control(cp) {
81 DerivedPropertyValue::Disallowed
82 } else if common::has_compat(cp) {
83 obj.on_has_compat()
84 } else if common::is_letter_digit(cp) {
85 DerivedPropertyValue::PValid
86 } else if common::is_other_letter_digit(cp) {
87 obj.on_other_letter_digits()
88 } else if common::is_space(cp) {
89 obj.on_spaces()
90 } else if common::is_symbol(cp) {
91 obj.on_symbols()
92 } else if common::is_punctuation(cp) {
93 obj.on_punctuation()
94 } else {
95 DerivedPropertyValue::Disallowed
96 }
97 }
98 },
99 }
100}
101
102fn allowed_by_context_rule(
103 label: &str,
104 val: DerivedPropertyValue,
105 cp: u32,
106 offset: usize,
107) -> Result<(), Error> {
108 match context::get_context_rule(cp) {
109 None => Err(Error::Unexpected(UnexpectedError::MissingContextRule(
110 CodepointInfo::new(cp, offset, val),
111 ))),
112 Some(rule) => match rule(label, offset) {
113 Ok(allowed) => {
114 if allowed {
115 Ok(())
116 } else {
117 Err(Error::BadCodepoint(CodepointInfo::new(cp, offset, val)))
118 }
119 }
120 Err(e) => match e {
121 context::ContextRuleError::NotApplicable => Err(Error::Unexpected(
122 UnexpectedError::ContextRuleNotApplicable(CodepointInfo::new(cp, offset, val)),
123 )),
124 context::ContextRuleError::Undefined => {
125 Err(Error::Unexpected(UnexpectedError::Undefined))
126 }
127 },
128 },
129 }
130}
131
132/// Base interface for all String classes in PRECIS framework.
133pub trait StringClass {
134 /// Gets the derived property value according to the algorithm defined
135 /// in [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
136 /// # Arguments
137 /// * `c`- Unicode character
138 /// # Return
139 /// This method returns the derived property value associated to a Unicode character
140 fn get_value_from_char(&self, c: char) -> DerivedPropertyValue;
141
142 /// Gets the derived property value according to the algorithm defined
143 /// in [`rfc8264`](https://datatracker.ietf.org/doc/html/rfc8264#section-8)
144 /// # Arguments:
145 /// * `cp`- Unicode code point
146 /// # Return
147 /// This method returns the derived property value associated to a Unicode character
148 fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue;
149
150 /// Ensures that the string consists only of Unicode code points that
151 /// are explicitly allowed by the PRECIS
152 /// [String Class](https://datatracker.ietf.org/doc/html/rfc8264#section-4)
153 /// # Arguments:
154 /// * `label` - string to check
155 /// # Returns
156 /// true if all character of `label` are allowed by the String Class.
157 fn allows<S>(&self, label: S) -> Result<(), Error>
158 where
159 S: AsRef<str>,
160 {
161 for (offset, c) in label.as_ref().chars().enumerate() {
162 let val = self.get_value_from_char(c);
163
164 match val {
165 DerivedPropertyValue::PValid | DerivedPropertyValue::SpecClassPval => Ok(()),
166 DerivedPropertyValue::SpecClassDis
167 | DerivedPropertyValue::Disallowed
168 | DerivedPropertyValue::Unassigned => Err(Error::BadCodepoint(CodepointInfo::new(
169 c as u32, offset, val,
170 ))),
171 DerivedPropertyValue::ContextJ | DerivedPropertyValue::ContextO => {
172 allowed_by_context_rule(label.as_ref(), val, c as u32, offset)
173 }
174 }?
175 }
176
177 Ok(())
178 }
179}
180
181/// Concrete class representing PRECIS `IdentifierClass` from
182/// [RFC 8264](https://datatracker.ietf.org/doc/html/rfc8264#section-4.2).
183/// # Example
184/// ```rust
185/// # use precis_core::{DerivedPropertyValue,IdentifierClass,StringClass};
186/// let id = IdentifierClass::default();
187/// // character 𐍁 is OtherLetterDigits (R)
188/// assert_eq!(id.get_value_from_char('𐍁'), DerivedPropertyValue::SpecClassDis);
189/// // Character S is ASCII7 (K)
190/// assert_eq!(id.get_value_from_char('S'), DerivedPropertyValue::PValid);
191/// // Character 0x1170 is OldHangulJamo (I)
192/// assert_eq!(id.get_value_from_codepoint(0x1170), DerivedPropertyValue::Disallowed);
193/// ```
194#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
195pub struct IdentifierClass {}
196
197impl SpecificDerivedPropertyValue for IdentifierClass {
198 // `ID_DIS` mapped to `SPEC_CLASS_DIS`
199 fn on_has_compat(&self) -> DerivedPropertyValue {
200 DerivedPropertyValue::SpecClassDis
201 }
202 fn on_other_letter_digits(&self) -> DerivedPropertyValue {
203 DerivedPropertyValue::SpecClassDis
204 }
205 fn on_spaces(&self) -> DerivedPropertyValue {
206 DerivedPropertyValue::SpecClassDis
207 }
208 fn on_symbols(&self) -> DerivedPropertyValue {
209 DerivedPropertyValue::SpecClassDis
210 }
211 fn on_punctuation(&self) -> DerivedPropertyValue {
212 DerivedPropertyValue::SpecClassDis
213 }
214}
215
216impl StringClass for IdentifierClass {
217 fn get_value_from_char(&self, c: char) -> DerivedPropertyValue {
218 get_derived_property_value(c as u32, self)
219 }
220
221 fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue {
222 get_derived_property_value(cp, self)
223 }
224}
225
226/// Concrete class representing PRECIS `FreeformClass` from
227/// [RFC 8264](https://datatracker.ietf.org/doc/html/rfc8264#section-4.3).
228/// # Example
229/// ```rust
230/// # use precis_core::{DerivedPropertyValue,FreeformClass,StringClass};
231/// let ff = FreeformClass::default();
232/// // character 𐍁 is OtherLetterDigits (R)
233/// assert_eq!(ff.get_value_from_char('𐍁'), DerivedPropertyValue::SpecClassPval);
234/// // Character S is ASCII7 (K)
235/// assert_eq!(ff.get_value_from_char('S'), DerivedPropertyValue::PValid);
236/// // Character 0x1170 is OldHangulJamo (I)
237/// assert_eq!(ff.get_value_from_codepoint(0x1170), DerivedPropertyValue::Disallowed);
238/// ```
239#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
240pub struct FreeformClass {}
241
242impl SpecificDerivedPropertyValue for FreeformClass {
243 fn on_has_compat(&self) -> DerivedPropertyValue {
244 DerivedPropertyValue::SpecClassPval
245 }
246 fn on_other_letter_digits(&self) -> DerivedPropertyValue {
247 DerivedPropertyValue::SpecClassPval
248 }
249 fn on_spaces(&self) -> DerivedPropertyValue {
250 DerivedPropertyValue::SpecClassPval
251 }
252 fn on_symbols(&self) -> DerivedPropertyValue {
253 DerivedPropertyValue::SpecClassPval
254 }
255 fn on_punctuation(&self) -> DerivedPropertyValue {
256 DerivedPropertyValue::SpecClassPval
257 }
258}
259
260impl StringClass for FreeformClass {
261 fn get_value_from_char(&self, c: char) -> DerivedPropertyValue {
262 get_derived_property_value(c as u32, self)
263 }
264
265 fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue {
266 get_derived_property_value(cp, self)
267 }
268}
269
#[cfg(test)]
mod test_string_classes {
    use super::*;

    /// String Class with a small hand-written code point table, used to
    /// drive the default `allows` implementation through every derived
    /// property value.
    pub struct TestClass {}

    impl StringClass for TestClass {
        fn get_value_from_char(&self, c: char) -> DerivedPropertyValue {
            self.get_value_from_codepoint(c as u32)
        }

        fn get_value_from_codepoint(&self, cp: u32) -> DerivedPropertyValue {
            match cp {
                // 'a', 'l', Virama
                0x0061 | 0x006c | 0x094d => DerivedPropertyValue::PValid,
                // 'b'
                0x0062 => DerivedPropertyValue::SpecClassPval,
                // 'c'
                0x0063 => DerivedPropertyValue::SpecClassDis,
                // 'd', ZERO WIDTH JOINER
                0x0064 | 0x200d => DerivedPropertyValue::ContextJ,
                // 'e', MIDDLE DOT
                0x0065 | 0x00b7 => DerivedPropertyValue::ContextO,
                // 'f'
                0x0066 => DerivedPropertyValue::Disallowed,
                _ => DerivedPropertyValue::Unassigned,
            }
        }
    }

    /// Expected result for a code point rejected as a bad code point.
    fn bad_codepoint(
        cp: u32,
        position: usize,
        property: DerivedPropertyValue,
    ) -> Result<(), Error> {
        Err(Error::BadCodepoint(CodepointInfo {
            cp,
            position,
            property,
        }))
    }

    /// Expected result for a contextual code point without a context rule.
    fn missing_context_rule(
        cp: u32,
        position: usize,
        property: DerivedPropertyValue,
    ) -> Result<(), Error> {
        Err(Error::Unexpected(UnexpectedError::MissingContextRule(
            CodepointInfo {
                cp,
                position,
                property,
            },
        )))
    }

    /// Expected result when a context rule yields an undefined outcome.
    fn undefined() -> Result<(), Error> {
        Err(Error::Unexpected(UnexpectedError::Undefined))
    }

    #[test]
    fn test_allows_code_point() {
        let class = TestClass {};

        // Test PValid
        assert_eq!(class.allows("\u{61}"), Ok(()));

        // Test SpecClassPval
        assert_eq!(class.allows("\u{62}"), Ok(()));

        // Test SpecClassDis
        assert_eq!(
            class.allows("\u{63}"),
            bad_codepoint(0x63, 0, DerivedPropertyValue::SpecClassDis)
        );

        // Test Disallowed
        assert_eq!(
            class.allows("\u{0066}"),
            bad_codepoint(0x66, 0, DerivedPropertyValue::Disallowed)
        );

        // Test Unassigned
        assert_eq!(
            class.allows("\u{67}"),
            bad_codepoint(0x67, 0, DerivedPropertyValue::Unassigned)
        );

        // Test ContextJ without context rule
        assert_eq!(
            class.allows("\u{64}"),
            missing_context_rule(0x64, 0, DerivedPropertyValue::ContextJ)
        );

        // Test ContextJ with context rule (Disallowed)
        assert_eq!(
            class.allows("a\u{200d}"),
            bad_codepoint(0x200d, 1, DerivedPropertyValue::ContextJ)
        );

        // Test ContextJ with context rule (Disallowed) => Unexpected Error
        assert_eq!(class.allows("\u{200d}"), undefined());

        // Test ContextJ with context rule (Allowed)
        assert_eq!(class.allows("\u{94d}\u{200d}"), Ok(()));

        // Test ContextO without context rule
        assert_eq!(
            class.allows("\u{65}"),
            missing_context_rule(0x65, 0, DerivedPropertyValue::ContextO)
        );

        // Test ContextO with context rule (Disallowed)
        assert_eq!(
            class.allows("a\u{00b7}b"),
            bad_codepoint(0x00b7, 1, DerivedPropertyValue::ContextO)
        );

        // Test ContextO with context rule (Disallowed) => Unexpected Error
        assert_eq!(class.allows("\u{00b7}"), undefined());

        // Test ContextO with context rule (Allowed)
        assert_eq!(class.allows("\u{006c}\u{00b7}\u{006c}"), Ok(()));
    }

    #[test]
    fn test_allowed_by_context_rule() {
        let context_o = DerivedPropertyValue::ContextO;

        // Check missing context rule
        assert_eq!(
            allowed_by_context_rule("test", context_o, 0xffff, 0),
            missing_context_rule(0xffff, 0, DerivedPropertyValue::ContextO)
        );

        // Check rule allowed (middle dot rule)
        assert_eq!(
            allowed_by_context_rule("\u{006c}\u{00b7}\u{006c}", context_o, 0x00b7, 1),
            Ok(())
        );

        // Check rule disallowed (middle dot rule)
        assert_eq!(
            allowed_by_context_rule("\u{006c}\u{00b7}a", context_o, 0x00b7, 1),
            bad_codepoint(0x00b7, 1, DerivedPropertyValue::ContextO)
        );

        // Check rule disallowed (middle dot rule) => Unexpected error
        assert_eq!(
            allowed_by_context_rule("\u{00b7}", context_o, 0x00b7, 0),
            undefined()
        );

        // Check rule not applicable
        assert_eq!(
            allowed_by_context_rule("\u{0066}", context_o, 0x00b7, 0),
            Err(Error::Unexpected(
                UnexpectedError::ContextRuleNotApplicable(CodepointInfo {
                    cp: 0x00b7,
                    position: 0,
                    property: DerivedPropertyValue::ContextO
                })
            ))
        );
    }
}