// ucd_parse/grapheme_cluster_break.rs

1use std::path::Path;
2
3use crate::{
4    common::{
5        parse_break_test, parse_codepoint_association, CodepointIter,
6        Codepoints, UcdFile, UcdFileByCodepoint,
7    },
8    error::Error,
9};
10
11/// A single row in the `auxiliary/GraphemeBreakProperty.txt` file.
12#[derive(Clone, Debug, Default, Eq, PartialEq)]
13pub struct GraphemeClusterBreak {
14    /// The codepoint or codepoint range for this entry.
15    pub codepoints: Codepoints,
16    /// The property value assigned to the codepoints in this entry.
17    pub value: String,
18}
19
20impl UcdFile for GraphemeClusterBreak {
21    fn relative_file_path() -> &'static Path {
22        Path::new("auxiliary/GraphemeBreakProperty.txt")
23    }
24}
25
26impl UcdFileByCodepoint for GraphemeClusterBreak {
27    fn codepoints(&self) -> CodepointIter {
28        self.codepoints.into_iter()
29    }
30}
31
32impl std::str::FromStr for GraphemeClusterBreak {
33    type Err = Error;
34
35    fn from_str(line: &str) -> Result<GraphemeClusterBreak, Error> {
36        let (codepoints, value) = parse_codepoint_association(line)?;
37        Ok(GraphemeClusterBreak { codepoints, value: value.to_string() })
38    }
39}
40
/// One parsed entry from the `auxiliary/GraphemeBreakTest.txt` file.
///
/// Entries in that file are test cases for the grapheme cluster break
/// algorithm.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct GraphemeClusterBreakTest {
    /// The expected grapheme clusters, in order. Every element is the UTF-8
    /// encoding of the codepoints forming exactly one grapheme cluster.
    pub grapheme_clusters: Vec<String>,
    /// The human readable description attached to this test.
    pub comment: String,
}
52
53impl UcdFile for GraphemeClusterBreakTest {
54    fn relative_file_path() -> &'static Path {
55        Path::new("auxiliary/GraphemeBreakTest.txt")
56    }
57}
58
59impl std::str::FromStr for GraphemeClusterBreakTest {
60    type Err = Error;
61
62    fn from_str(line: &str) -> Result<GraphemeClusterBreakTest, Error> {
63        let (groups, comment) = parse_break_test(line)?;
64        Ok(GraphemeClusterBreakTest { grapheme_clusters: groups, comment })
65    }
66}
67
#[cfg(test)]
mod tests {
    use super::{GraphemeClusterBreak, GraphemeClusterBreakTest};

    /// A property line naming a single codepoint yields that codepoint and
    /// its property value.
    #[test]
    fn parse_single() {
        let input = "093B          ; SpacingMark # Mc       DEVANAGARI VOWEL SIGN OOE\n";
        let parsed = input.parse::<GraphemeClusterBreak>().unwrap();
        assert_eq!(parsed.codepoints, 0x093B);
        assert_eq!(parsed.value, "SpacingMark");
    }

    /// A property line naming a `start..end` range yields both endpoints and
    /// the property value.
    #[test]
    fn parse_range() {
        let input = "1F1E6..1F1FF  ; Regional_Indicator # So  [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z\n";
        let parsed = input.parse::<GraphemeClusterBreak>().unwrap();
        assert_eq!(parsed.codepoints, (0x1F1E6, 0x1F1FF));
        assert_eq!(parsed.value, "Regional_Indicator");
    }

    /// A break test line is split into its grapheme clusters, and the
    /// trailing description is kept as the comment.
    #[test]
    fn parse_test() {
        let input = "÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷	#  ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]\n";

        let parsed = input.parse::<GraphemeClusterBreakTest>().unwrap();
        let expected = vec!["\u{0061}\u{1F3FF}", "\u{1F476}\u{200D}\u{1F6D1}"];
        assert_eq!(parsed.grapheme_clusters, expected);
        assert!(parsed.comment.starts_with("÷ [0.2] LATIN SMALL LETTER A"));
    }
}