ucd_parse/
unicode_data.rs

1use std::path::Path;
2
3use crate::{
4    common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5    error::Error,
6};
7
/// Represents a single row in the `UnicodeData.txt` file.
///
/// These fields were taken from UAX44, Table 9, as part of the documentation
/// for the
/// [`UnicodeData.txt` file](https://www.unicode.org/reports/tr44/#UnicodeData.txt).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The codepoint corresponding to this row.
    pub codepoint: Codepoint,
    /// The name of this codepoint.
    pub name: String,
    /// The "general category" of this codepoint.
    pub general_category: String,
    /// The class of this codepoint used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [UAX44, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this codepoint.
    ///
    /// Possible values are listed in
    /// [UAX44, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this codepoint. This includes its
    /// formatting tag (if present).
    ///
    /// When a parsed row has an empty decomposition field, this holds a
    /// mapping consisting of just this row's own codepoint.
    pub decomposition: UnicodeDataDecomposition,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing codepoints, no new codepoints will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this codepoint, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// A boolean indicating whether this codepoint is "mirrored" in
    /// bidirectional text.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This codepoint's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// This codepoint's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// This codepoint's simple titlecase mapping, if it exists.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
62
63impl UcdFile for UnicodeData {
64    fn relative_file_path() -> &'static Path {
65        Path::new("UnicodeData.txt")
66    }
67}
68
impl UcdFileByCodepoint for UnicodeData {
    /// Returns an iterator built from this row's single `codepoint` field.
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}
74
75impl UnicodeData {
76    /// Returns true if and only if this record corresponds to the start of a
77    /// range.
78    pub fn is_range_start(&self) -> bool {
79        self.name.starts_with('<')
80            && self.name.ends_with('>')
81            && self.name.contains("First")
82    }
83
84    /// Returns true if and only if this record corresponds to the end of a
85    /// range.
86    pub fn is_range_end(&self) -> bool {
87        self.name.starts_with('<')
88            && self.name.ends_with('>')
89            && self.name.contains("Last")
90    }
91}
92
93impl std::str::FromStr for UnicodeData {
94    type Err = Error;
95
96    fn from_str(line: &str) -> Result<UnicodeData, Error> {
97        let re_parts = regex!(
98            r"(?x)
99                ^
100                ([A-Z0-9]+);  #  1; codepoint
101                ([^;]+);      #  2; name
102                ([^;]+);      #  3; general category
103                ([0-9]+);     #  4; canonical combining class
104                ([^;]+);      #  5; bidi class
105                ([^;]*);      #  6; decomposition
106                ([0-9]*);     #  7; numeric type decimal
107                ([0-9]*);     #  8; numeric type digit
108                ([-0-9/]*);   #  9; numeric type numeric
109                ([YN]);       # 10; bidi mirrored
110                ([^;]*);      # 11; unicode1 name
111                ([^;]*);      # 12; ISO comment
112                ([^;]*);      # 13; simple uppercase mapping
113                ([^;]*);      # 14; simple lowercase mapping
114                ([^;]*)       # 15; simple titlecase mapping
115                $
116                ",
117        );
118
119        let caps = match re_parts.captures(line.trim()) {
120            Some(caps) => caps,
121            None => return err!("invalid UnicodeData line"),
122        };
123        let capget = |n| caps.get(n).unwrap().as_str();
124        let mut data = UnicodeData::default();
125
126        data.codepoint = capget(1).parse()?;
127        data.name = capget(2).to_string();
128        data.general_category = capget(3).to_string();
129        data.canonical_combining_class = match capget(4).parse() {
130            Ok(n) => n,
131            Err(err) => {
132                return err!(
133                    "failed to parse canonical combining class '{}': {}",
134                    capget(4),
135                    err
136                )
137            }
138        };
139        data.bidi_class = capget(5).to_string();
140        if !caps[6].is_empty() {
141            data.decomposition = caps[6].parse()?;
142        } else {
143            data.decomposition.push(data.codepoint)?;
144        }
145        if !capget(7).is_empty() {
146            data.numeric_type_decimal = Some(match capget(7).parse() {
147                Ok(n) => n,
148                Err(err) => {
149                    return err!(
150                        "failed to parse numeric type decimal '{}': {}",
151                        capget(7),
152                        err
153                    )
154                }
155            });
156        }
157        if !capget(8).is_empty() {
158            data.numeric_type_digit = Some(match capget(8).parse() {
159                Ok(n) => n,
160                Err(err) => {
161                    return err!(
162                        "failed to parse numeric type digit '{}': {}",
163                        capget(8),
164                        err
165                    )
166                }
167            });
168        }
169        if !capget(9).is_empty() {
170            data.numeric_type_numeric = Some(capget(9).parse()?);
171        }
172        data.bidi_mirrored = capget(10) == "Y";
173        data.unicode1_name = capget(11).to_string();
174        data.iso_comment = capget(12).to_string();
175        if !capget(13).is_empty() {
176            data.simple_uppercase_mapping = Some(capget(13).parse()?);
177        }
178        if !capget(14).is_empty() {
179            data.simple_lowercase_mapping = Some(capget(14).parse()?);
180        }
181        if !capget(15).is_empty() {
182            data.simple_titlecase_mapping = Some(capget(15).parse()?);
183        }
184        Ok(data)
185    }
186}
187
188impl std::fmt::Display for UnicodeData {
189    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190        write!(f, "{};", self.codepoint)?;
191        write!(f, "{};", self.name)?;
192        write!(f, "{};", self.general_category)?;
193        write!(f, "{};", self.canonical_combining_class)?;
194        write!(f, "{};", self.bidi_class)?;
195        if self.decomposition.is_canonical()
196            && self.decomposition.mapping() == &[self.codepoint]
197        {
198            write!(f, ";")?;
199        } else {
200            write!(f, "{};", self.decomposition)?;
201        }
202        if let Some(n) = self.numeric_type_decimal {
203            write!(f, "{};", n)?;
204        } else {
205            write!(f, ";")?;
206        }
207        if let Some(n) = self.numeric_type_digit {
208            write!(f, "{};", n)?;
209        } else {
210            write!(f, ";")?;
211        }
212        if let Some(n) = self.numeric_type_numeric {
213            write!(f, "{};", n)?;
214        } else {
215            write!(f, ";")?;
216        }
217        write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
218        write!(f, "{};", self.unicode1_name)?;
219        write!(f, "{};", self.iso_comment)?;
220        if let Some(cp) = self.simple_uppercase_mapping {
221            write!(f, "{};", cp)?;
222        } else {
223            write!(f, ";")?;
224        }
225        if let Some(cp) = self.simple_lowercase_mapping {
226            write!(f, "{};", cp)?;
227        } else {
228            write!(f, ";")?;
229        }
230        if let Some(cp) = self.simple_titlecase_mapping {
231            write!(f, "{}", cp)?;
232        }
233        Ok(())
234    }
235}
236
/// Represents a decomposition mapping of a single row in the
/// `UnicodeData.txt` file.
///
/// The codepoints are stored inline in a fixed-size array; `len` records how
/// many leading entries of `mapping` are in use.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag associated with this mapping, if present.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of codepoints in this mapping.
    pub len: usize,
    /// The codepoints in the mapping. Entries beyond `len` in the mapping
    /// are always U+0000. If no mapping was present, then this always contains
    /// a single codepoint corresponding to this row's character.
    pub mapping: [Codepoint; 18],
}
250
251impl UnicodeDataDecomposition {
252    /// Create a new decomposition mapping with the given tag and codepoints.
253    ///
254    /// If there are too many codepoints, then an error is returned.
255    pub fn new(
256        tag: Option<UnicodeDataDecompositionTag>,
257        mapping: &[Codepoint],
258    ) -> Result<UnicodeDataDecomposition, Error> {
259        let mut x = UnicodeDataDecomposition::default();
260        x.tag = tag;
261        for &cp in mapping {
262            x.push(cp)?;
263        }
264        Ok(x)
265    }
266
267    /// Add a new codepoint to this decomposition's mapping.
268    ///
269    /// If the mapping is already full, then this returns an error.
270    pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
271        if self.len >= self.mapping.len() {
272            return err!(
273                "invalid decomposition mapping (too many codepoints)"
274            );
275        }
276        self.mapping[self.len] = cp;
277        self.len += 1;
278        Ok(())
279    }
280
281    /// Return the mapping as a slice of codepoints. The slice returned
282    /// has length equivalent to the number of codepoints in this mapping.
283    pub fn mapping(&self) -> &[Codepoint] {
284        &self.mapping[..self.len]
285    }
286
287    /// Returns true if and only if this decomposition mapping is canonical.
288    pub fn is_canonical(&self) -> bool {
289        self.tag.is_none()
290    }
291}
292
293impl std::str::FromStr for UnicodeDataDecomposition {
294    type Err = Error;
295
296    fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
297        let re_with_tag =
298            regex!(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$");
299        let re_chars = regex!(r"[0-9A-F]+");
300        if s.is_empty() {
301            return err!(
302                "expected non-empty string for \
303                 UnicodeDataDecomposition value"
304            );
305        }
306        let caps = match re_with_tag.captures(s) {
307            Some(caps) => caps,
308            None => return err!("invalid decomposition value"),
309        };
310        let mut decomp = UnicodeDataDecomposition::default();
311        let mut codepoints = s;
312        if let Some(m) = caps.name("tag") {
313            decomp.tag = Some(m.as_str().parse()?);
314            codepoints = &caps["chars"];
315        }
316        for m in re_chars.find_iter(codepoints) {
317            let cp = m.as_str().parse()?;
318            decomp.push(cp)?;
319        }
320        Ok(decomp)
321    }
322}
323
324impl std::fmt::Display for UnicodeDataDecomposition {
325    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
326        if let Some(ref tag) = self.tag {
327            write!(f, "<{}> ", tag)?;
328        }
329        let mut first = true;
330        for cp in self.mapping() {
331            if !first {
332                write!(f, " ")?;
333            }
334            first = false;
335            write!(f, "{}", cp)?;
336        }
337        Ok(())
338    }
339}
340
/// The formatting tag on a decomposition mapping.
///
/// This is taken from
/// [UAX44, Table 14](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
///
/// Each variant corresponds to the tag text shown in its documentation, which
/// is the text (without the angle brackets) accepted by `FromStr` and
/// produced by `Display`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// `<font>`
    Font,
    /// `<noBreak>`
    NoBreak,
    /// `<initial>`
    Initial,
    /// `<medial>`
    Medial,
    /// `<final>`
    Final,
    /// `<isolated>`
    Isolated,
    /// `<circle>`
    Circle,
    /// `<super>`
    Super,
    /// `<sub>`
    Sub,
    /// `<vertical>`
    Vertical,
    /// `<wide>`
    Wide,
    /// `<narrow>`
    Narrow,
    /// `<small>`
    Small,
    /// `<square>`
    Square,
    /// `<fraction>`
    Fraction,
    /// `<compat>`
    Compat,
}
380
381impl std::str::FromStr for UnicodeDataDecompositionTag {
382    type Err = Error;
383
384    fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
385        use self::UnicodeDataDecompositionTag::*;
386        Ok(match s {
387            "font" => Font,
388            "noBreak" => NoBreak,
389            "initial" => Initial,
390            "medial" => Medial,
391            "final" => Final,
392            "isolated" => Isolated,
393            "circle" => Circle,
394            "super" => Super,
395            "sub" => Sub,
396            "vertical" => Vertical,
397            "wide" => Wide,
398            "narrow" => Narrow,
399            "small" => Small,
400            "square" => Square,
401            "fraction" => Fraction,
402            "compat" => Compat,
403            _ => return err!("invalid decomposition formatting tag: {}", s),
404        })
405    }
406}
407
408impl std::fmt::Display for UnicodeDataDecompositionTag {
409    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
410        use self::UnicodeDataDecompositionTag::*;
411        let s = match *self {
412            Font => "font",
413            NoBreak => "noBreak",
414            Initial => "initial",
415            Medial => "medial",
416            Final => "final",
417            Isolated => "isolated",
418            Circle => "circle",
419            Super => "super",
420            Sub => "sub",
421            Vertical => "vertical",
422            Wide => "wide",
423            Narrow => "narrow",
424            Small => "small",
425            Square => "square",
426            Fraction => "fraction",
427            Compat => "compat",
428        };
429        write!(f, "{}", s)
430    }
431}
432
/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
///
/// A numeric value can either be a signed integer or a rational number.
/// The textual form is either a bare integer (e.g. `9`) or a
/// `numerator/denominator` pair (e.g. `-1/2`).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first is the numerator and the latter is the
    /// denominator.
    Rational(i64, i64),
}
444
445impl std::str::FromStr for UnicodeDataNumeric {
446    type Err = Error;
447
448    fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
449        if s.is_empty() {
450            return err!(
451                "expected non-empty string for UnicodeDataNumeric value"
452            );
453        }
454        if let Some(pos) = s.find('/') {
455            let (snum, sden) = (&s[..pos], &s[pos + 1..]);
456            let num = match snum.parse() {
457                Ok(num) => num,
458                Err(err) => {
459                    return err!(
460                        "invalid integer numerator '{}': {}",
461                        snum,
462                        err
463                    );
464                }
465            };
466            let den = match sden.parse() {
467                Ok(den) => den,
468                Err(err) => {
469                    return err!(
470                        "invalid integer denominator '{}': {}",
471                        sden,
472                        err
473                    );
474                }
475            };
476            Ok(UnicodeDataNumeric::Rational(num, den))
477        } else {
478            match s.parse() {
479                Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
480                Err(err) => {
481                    return err!(
482                        "invalid integer denominator '{}': {}",
483                        s,
484                        err
485                    );
486                }
487            }
488        }
489    }
490}
491
492impl std::fmt::Display for UnicodeDataNumeric {
493    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
494        match *self {
495            UnicodeDataNumeric::Integer(n) => write!(f, "{}", n),
496            UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d),
497        }
498    }
499}
500
/// An iterator adapter that expands rows in `UnicodeData.txt`.
///
/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
/// represented. Instead, they are represented by a pair of rows, indicating
/// a range of codepoints with the same properties. For example, the Hangul
/// syllable codepoints are represented by these two rows:
///
/// ```text
/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
/// ```
///
/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
/// Unicode codepoints is found, it will be expanded to the appropriate
/// sequence of `UnicodeData` values. Note that all such expanded records will
/// have an empty name.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator. It is peekable so that the row following a
    /// range-start row can be inspected before deciding whether to expand.
    it: std::iter::Peekable<I>,
    /// The range of codepoints currently being emitted after a First/Last
    /// pair was found. When no expansion is in progress this range is empty
    /// (it starts out as `0..0`).
    range: CodepointRange,
}
524
/// The expansion state for one First/Last pair: the codepoints still to be
/// emitted plus the record that the emitted rows are copied from.
struct CodepointRange {
    /// The codepoint range (end-exclusive) that remains to be emitted.
    range: std::ops::Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}
532
533impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
534    /// Create a new iterator that expands pairs of `UnicodeData` range
535    /// records. All other records are passed through as-is.
536    pub fn new<T>(it: T) -> UnicodeDataExpander<I>
537    where
538        T: IntoIterator<IntoIter = I, Item = I::Item>,
539    {
540        UnicodeDataExpander {
541            it: it.into_iter().peekable(),
542            range: CodepointRange {
543                range: 0..0,
544                start_record: UnicodeData::default(),
545            },
546        }
547    }
548}
549
550impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
551    type Item = UnicodeData;
552
553    fn next(&mut self) -> Option<UnicodeData> {
554        if let Some(udata) = self.range.next() {
555            return Some(udata);
556        }
557        let row1 = match self.it.next() {
558            None => return None,
559            Some(row1) => row1,
560        };
561        if !row1.is_range_start()
562            || !self.it.peek().map_or(false, |row2| row2.is_range_end())
563        {
564            return Some(row1);
565        }
566        let row2 = self.it.next().unwrap();
567        self.range = CodepointRange {
568            range: row1.codepoint.value()..(row2.codepoint.value() + 1),
569            start_record: row1,
570        };
571        self.next()
572    }
573}
574
575impl Iterator for CodepointRange {
576    type Item = UnicodeData;
577
578    fn next(&mut self) -> Option<UnicodeData> {
579        let cp = match self.range.next() {
580            None => return None,
581            Some(cp) => cp,
582        };
583        Some(UnicodeData {
584            codepoint: Codepoint::from_u32(cp).unwrap(),
585            name: "".to_string(),
586            ..self.start_record.clone()
587        })
588    }
589}
590
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    /// Builds a `Codepoint` from a raw value, panicking if it is rejected.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    /// Shorthand for turning a string literal into an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    /// A row with a tagged (`<compat>`) decomposition of three codepoints.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row with a bracketed name, an empty decomposition field (which maps
    /// the codepoint to itself) and a Unicode 1.0 name.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row with a `<fraction>` decomposition and a rational numeric value.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row whose only case mapping is the simple lowercase mapping.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row with a negative rational numeric value.
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// Expanding a First/Last pair yields one record per codepoint in the
    /// range, with the surrounding ordinary rows passed through.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        // 1 row before the pair + 11172 codepoints in AC00..=D7A3 + 1 row
        // after the pair.
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}
ucd_parse/unicode_data.rs

ucd_parse/
unicode_data.rs