ratex_types/
unicode_scripts.rs

//! Unicode script detection for characters supported in `\text{}` blocks.
//!
//! Port of KaTeX's `unicodeScripts.ts`. Characters from these scripts can
//! appear inside `\text{}` even when no font metrics exist for them.
//!
//! Each script has a name and one or more inclusive `[start, end]` codepoint blocks.
7
/// A script, or family of related scripts, that a codepoint can be classified as.
///
/// Values of this type are produced by [`script_from_codepoint`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnicodeScript {
    /// Latin letters and combining marks outside the basic Latin range.
    Latin,
    /// Cyrillic.
    Cyrillic,
    /// Armenian.
    Armenian,
    /// The Brahmic family of scripts (for example Devanagari and Thai).
    Brahmic,
    /// Georgian.
    Georgian,
    /// Chinese and Japanese; Korean is tracked separately as [`UnicodeScript::Hangul`].
    Cjk,
    /// Korean Hangul syllables.
    Hangul,
}

impl UnicodeScript {
    /// Returns the lowercase name of this script, e.g. `"cjk"` for
    /// [`UnicodeScript::Cjk`].
    pub fn as_str(self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a name to be chosen here.
        let name: &'static str = match self {
            UnicodeScript::Latin => "latin",
            UnicodeScript::Cyrillic => "cyrillic",
            UnicodeScript::Armenian => "armenian",
            UnicodeScript::Brahmic => "brahmic",
            UnicodeScript::Georgian => "georgian",
            UnicodeScript::Cjk => "cjk",
            UnicodeScript::Hangul => "hangul",
        };
        name
    }
}
32
33struct ScriptDef {
34    script: UnicodeScript,
35    blocks: &'static [(u32, u32)],
36}
37
38static SCRIPT_DATA: &[ScriptDef] = &[
39    ScriptDef {
40        script: UnicodeScript::Latin,
41        blocks: &[
42            (0x0100, 0x024F), // Latin Extended-A and Latin Extended-B
43            (0x0300, 0x036F), // Combining Diacritical Marks
44        ],
45    },
46    ScriptDef {
47        script: UnicodeScript::Cyrillic,
48        blocks: &[(0x0400, 0x04FF)],
49    },
50    ScriptDef {
51        script: UnicodeScript::Armenian,
52        blocks: &[(0x0530, 0x058F)],
53    },
54    ScriptDef {
55        script: UnicodeScript::Brahmic,
56        blocks: &[(0x0900, 0x109F)],
57    },
58    ScriptDef {
59        script: UnicodeScript::Georgian,
60        blocks: &[(0x10A0, 0x10FF)],
61    },
62    ScriptDef {
63        script: UnicodeScript::Cjk,
64        blocks: &[
65            (0x3000, 0x30FF), // CJK symbols, Hiragana, Katakana
66            (0x4E00, 0x9FAF), // CJK Unified Ideographs
67            (0xFF00, 0xFF60), // Fullwidth forms
68        ],
69    },
70    ScriptDef {
71        script: UnicodeScript::Hangul,
72        blocks: &[(0xAC00, 0xD7AF)],
73    },
74];
75
76/// Identify the script/script family of a Unicode codepoint, if known.
77pub fn script_from_codepoint(codepoint: u32) -> Option<UnicodeScript> {
78    for def in SCRIPT_DATA {
79        for &(lo, hi) in def.blocks {
80            if codepoint >= lo && codepoint <= hi {
81                return Some(def.script);
82            }
83        }
84    }
85    None
86}
87
88/// Return `true` if the codepoint falls within one of the supported scripts.
89///
90/// Used in text mode to decide whether a character without font metrics
91/// should be rendered with fallback metrics (Latin capital M).
92pub fn supported_codepoint(codepoint: u32) -> bool {
93    for def in SCRIPT_DATA {
94        for &(lo, hi) in def.blocks {
95            if codepoint >= lo && codepoint <= hi {
96                return true;
97            }
98        }
99    }
100    false
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII letters are not part of any table range.
    #[test]
    fn test_basic_latin_not_matched() {
        for ch in ['A', 'z'] {
            assert!(!supported_codepoint(u32::from(ch)));
        }
    }

    /// Both ends of the Latin Extended-A/B range are accepted.
    #[test]
    fn test_latin_extended() {
        // U+0100 is Ā; U+024F is the last codepoint of Latin Extended-B.
        for cp in [0x0100, 0x024F] {
            assert!(supported_codepoint(cp));
        }
        assert_eq!(script_from_codepoint(0x0100), Some(UnicodeScript::Latin));
    }

    /// Combining diacritical marks are classified as Latin.
    #[test]
    fn test_combining_diacritical() {
        // Combining grave accent, combining acute accent, end of the block.
        for cp in [0x0300, 0x0301, 0x036F] {
            assert!(supported_codepoint(cp));
        }
        assert_eq!(script_from_codepoint(0x0301), Some(UnicodeScript::Latin));
    }

    #[test]
    fn test_cyrillic() {
        let capital_a = u32::from('А'); // Cyrillic А = 0x0410
        let small_ya = u32::from('я'); // Cyrillic я = 0x044F
        assert!(supported_codepoint(capital_a));
        assert!(supported_codepoint(small_ya));
        assert_eq!(script_from_codepoint(capital_a), Some(UnicodeScript::Cyrillic));
    }

    #[test]
    fn test_cjk() {
        let ideograph = u32::from('中'); // U+4E2D
        let hiragana = u32::from('あ'); // U+3042
        assert!(supported_codepoint(ideograph));
        assert!(supported_codepoint(hiragana));
        assert_eq!(script_from_codepoint(ideograph), Some(UnicodeScript::Cjk));
    }

    #[test]
    fn test_hangul() {
        let first_syllable = 0xAC00;
        assert!(supported_codepoint(first_syllable));
        assert_eq!(script_from_codepoint(first_syllable), Some(UnicodeScript::Hangul));
    }

    #[test]
    fn test_brahmic() {
        let devanagari_start = 0x0900;
        let thai = 0x0E01;
        assert!(supported_codepoint(devanagari_start));
        assert!(supported_codepoint(thai));
        assert_eq!(script_from_codepoint(devanagari_start), Some(UnicodeScript::Brahmic));
    }

    #[test]
    fn test_armenian() {
        assert!(supported_codepoint(0x0530));
        assert_eq!(script_from_codepoint(0x0531), Some(UnicodeScript::Armenian));
    }

    #[test]
    fn test_georgian() {
        let range_start = 0x10A0;
        assert!(supported_codepoint(range_start));
        assert_eq!(script_from_codepoint(range_start), Some(UnicodeScript::Georgian));
    }

    /// A codepoint outside every range is rejected by both lookups.
    #[test]
    fn test_unsupported_codepoint() {
        let outside = 0xFFFF;
        assert!(!supported_codepoint(outside));
        assert_eq!(script_from_codepoint(outside), None);
    }
}
ratex_types/unicode_scripts.rs

ratex_types/
unicode_scripts.rs