ens_normalize_rs/code_points/
specs.rs1use super::types::*;
2use crate::{
3 constants,
4 static_data::{
5 nf_json,
6 spec_json::{self, GroupName},
7 },
8 utils, CodePoint,
9};
10use regex::Regex;
11use std::collections::{HashMap, HashSet};
12
13pub struct CodePointsSpecs {
15 cm: HashSet<CodePoint>,
16 ignored: HashSet<CodePoint>,
17 mapped: HashMap<CodePoint, Vec<CodePoint>>,
18 nfc_check: HashSet<CodePoint>,
19 whole_map: ParsedWholeMap,
20 fenced: HashMap<CodePoint, String>,
21 groups: Vec<ParsedGroup>,
22 group_name_to_index: HashMap<spec_json::GroupName, usize>,
23 valid: HashSet<CodePoint>,
24 nsm: HashSet<CodePoint>,
25 nsm_max: u32,
26 emoji_no_fe0f_to_pretty: HashMap<Vec<CodePoint>, Vec<CodePoint>>,
27 decomp: HashMap<CodePoint, Vec<CodePoint>>,
28 emoji_regex: Regex,
29}
30
31impl CodePointsSpecs {
32 pub fn new(spec: spec_json::Spec, nf: nf_json::Nf) -> Self {
33 let emoji: HashSet<Vec<CodePoint>> = spec.emoji.into_iter().collect();
34 let emoji_no_fe0f_to_pretty = emoji
35 .iter()
36 .map(|e| (utils::filter_fe0f(e), e.clone()))
37 .collect();
38 let decomp = nf
39 .decomp
40 .into_iter()
41 .map(|item| (item.number, item.nested_numbers))
42 .collect();
43 let groups: Vec<ParsedGroup> = spec.groups.into_iter().map(ParsedGroup::from).collect();
44 let group_name_to_index: HashMap<spec_json::GroupName, usize> = groups
45 .iter()
46 .enumerate()
47 .map(|(i, g)| (g.name.clone(), i))
48 .collect();
49 let valid = compute_valid(&groups, &decomp);
50 let whole_map = compute_whole_map(spec.whole_map);
51 let emoji_str_list = emoji
52 .iter()
53 .map(|cps| utils::cps2str(cps))
54 .collect::<Vec<_>>();
55 let emoji_regex =
56 create_emoji_regex_pattern(emoji_str_list).expect("failed to create emoji regex");
57
58 Self {
59 cm: spec.cm.into_iter().collect(),
60 emoji_no_fe0f_to_pretty,
61 ignored: spec.ignored.into_iter().collect(),
62 mapped: spec.mapped.into_iter().map(|m| (m.from, m.to)).collect(),
63 nfc_check: spec.nfc_check.into_iter().collect(),
64 fenced: spec.fenced.into_iter().map(|f| (f.from, f.to)).collect(),
65 valid,
66 groups,
67 nsm: spec.nsm.into_iter().collect(),
68 nsm_max: spec.nsm_max,
69 decomp,
70 whole_map,
71 group_name_to_index,
72 emoji_regex,
73 }
74 }
75}
76
77impl Default for CodePointsSpecs {
78 fn default() -> Self {
79 let spec = spec_json::Spec::default();
80 let nf = nf_json::Nf::default();
81 Self::new(spec, nf)
82 }
83}
84
85impl CodePointsSpecs {
86 pub fn get_mapping(&self, cp: CodePoint) -> Option<&Vec<CodePoint>> {
87 self.mapped.get(&cp)
88 }
89
90 pub fn cps_is_emoji(&self, cps: &[CodePoint]) -> bool {
91 let s = utils::cps2str(cps);
92 let maybe_match = self.finditer_emoji(&s).next();
93 maybe_match
94 .map(|m| m.start() == 0 && m.end() == s.len())
95 .unwrap_or(false)
96 }
97
98 pub fn finditer_emoji<'a>(&'a self, s: &'a str) -> impl Iterator<Item = regex::Match<'_>> {
99 self.emoji_regex.find_iter(s)
100 }
101
102 pub fn cps_requires_check(&self, cps: &[CodePoint]) -> bool {
103 cps.iter().any(|cp| self.nfc_check.contains(cp))
104 }
105
106 pub fn cps_emoji_no_fe0f_to_pretty(&self, cps: &[CodePoint]) -> Option<&Vec<CodePoint>> {
107 self.emoji_no_fe0f_to_pretty.get(cps)
108 }
109
110 pub fn maybe_normalize(&self, cp: CodePoint) -> Option<&Vec<CodePoint>> {
111 self.mapped.get(&cp)
112 }
113
114 pub fn is_valid(&self, cp: CodePoint) -> bool {
115 self.valid.contains(&cp)
116 }
117
118 pub fn is_ignored(&self, cp: CodePoint) -> bool {
119 self.ignored.contains(&cp)
120 }
121
122 pub fn is_stop(&self, cp: CodePoint) -> bool {
123 cp == constants::CP_STOP
124 }
125
126 pub fn is_fenced(&self, cp: CodePoint) -> bool {
127 self.fenced.contains_key(&cp)
128 }
129
130 pub fn is_cm(&self, cp: CodePoint) -> bool {
131 self.cm.contains(&cp)
132 }
133
134 pub fn groups_for_cps<'a>(
135 &'a self,
136 cps: &'a [CodePoint],
137 ) -> impl Iterator<Item = &'a ParsedGroup> {
138 self.groups
139 .iter()
140 .filter(|group| cps.iter().all(|cp| group.contains_cp(*cp)))
141 }
142
143 pub fn is_nsm(&self, cp: CodePoint) -> bool {
144 self.nsm.contains(&cp)
145 }
146
147 pub fn nsm_max(&self) -> u32 {
148 self.nsm_max
149 }
150
151 pub fn decompose(&self, cp: CodePoint) -> Option<&Vec<CodePoint>> {
152 self.decomp.get(&cp)
153 }
154
155 pub fn whole_map(&self, cp: CodePoint) -> Option<&ParsedWholeValue> {
156 self.whole_map.get(&cp)
157 }
158
159 pub fn group_by_name(&self, name: impl Into<GroupName>) -> Option<&ParsedGroup> {
160 self.group_name_to_index
161 .get(&name.into())
162 .and_then(|i| self.groups.get(*i))
163 }
164}
165
166fn compute_valid(
167 groups: &[ParsedGroup],
168 decomp: &HashMap<CodePoint, Vec<CodePoint>>,
169) -> HashSet<CodePoint> {
170 let mut valid = HashSet::new();
171 for g in groups {
172 valid.extend(g.primary_plus_secondary.iter());
173 }
174
175 let ndf: Vec<CodePoint> = valid
176 .iter()
177 .flat_map(|cp| decomp.get(cp).cloned().unwrap_or_default())
178 .collect();
179 valid.extend(ndf);
180 valid
181}
182
183fn compute_whole_map(whole_map: HashMap<String, spec_json::WholeValue>) -> ParsedWholeMap {
184 whole_map
185 .into_iter()
186 .map(|(k, v)| (k.parse::<CodePoint>().unwrap(), v.try_into().unwrap()))
187 .collect()
188}
189
190fn create_emoji_regex_pattern(emojis: Vec<impl AsRef<str>>) -> Result<Regex, regex::Error> {
191 let fe0f = regex::escape(constants::STR_FE0F);
192
193 let make_emoji = |emoji: &str| regex::escape(emoji).replace(&fe0f, &format!("{}?", fe0f));
195
196 let order = |emoji: &str| emoji.replace(constants::STR_FE0F, "").len();
198
199 let mut sorted_emojis = emojis;
200 sorted_emojis.sort_by_key(|b| std::cmp::Reverse(order(b.as_ref())));
201
202 let emoji_regex = sorted_emojis
203 .into_iter()
204 .map(|emoji| make_emoji(emoji.as_ref()))
205 .collect::<Vec<_>>()
206 .join("|");
207
208 regex::Regex::new(&emoji_regex)
209}
210
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use rstest::{fixture, rstest};

    /// Shared specs instance, built once for all tests.
    #[fixture]
    #[once]
    fn specs() -> CodePointsSpecs {
        CodePointsSpecs::default()
    }

    // Non-ASCII inputs are written as escapes so they survive re-encoding.
    #[rstest]
    #[case::letter_a('A', "a")]
    // U+2165 ROMAN NUMERAL SIX
    #[case::roman_numeral_vi('\u{2165}', "vi")]
    fn test_mapped(#[case] input: char, #[case] output: &str, specs: &CodePointsSpecs) {
        let mapped = specs.get_mapping(input as u32);
        let expected = output.chars().map(|c| c as u32).collect::<Vec<_>>();
        assert_eq!(mapped, Some(&expected));
    }

    #[rstest]
    // U+2044 FRACTION SLASH
    #[case::slash("\u{2044}")]
    fn test_fenced(#[case] fence: &str, specs: &CodePointsSpecs) {
        assert!(
            specs
                .fenced
                .contains_key(&(fence.chars().next().unwrap() as u32)),
            "Fence {fence} not found"
        );
    }

    #[rstest]
    // NOTE(review): the original emoji was lost to mis-encoding; only its
    // 4-byte UTF-8 length (offsets 5..9) is known. U+1F600 is a stand-in with
    // that length - confirm against the upstream test.
    #[case::string("hello\u{1F600}", vec![("\u{1F600}", 5, 9)])]
    // U+1F468 MAN, U+200D ZERO WIDTH JOINER, U+1F4BB PERSONAL COMPUTER (11 bytes)
    #[case::man_technologist(
        "\u{1F468}\u{200D}\u{1F4BB}",
        vec![("\u{1F468}\u{200D}\u{1F4BB}", 0, 11)]
    )]
    fn test_emoji(
        #[case] emoji: &str,
        #[case] expected: Vec<(&str, usize, usize)>,
        specs: &CodePointsSpecs,
    ) {
        let matches = specs.finditer_emoji(emoji).collect::<Vec<_>>();
        assert_eq!(matches.len(), expected.len());
        for (i, (emoji, start, end)) in expected.into_iter().enumerate() {
            assert_eq!(matches[i].as_str(), emoji);
            assert_eq!(matches[i].start(), start);
            assert_eq!(matches[i].end(), end);
        }
    }

    #[rstest]
    #[case::small(&[36, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 95, 97])]
    #[case::big(&[205743, 205742, 205741, 205740, 205739, 205738, 205737, 205736])]
    fn test_valid(#[case] cps: &[CodePoint], specs: &CodePointsSpecs) {
        for cp in cps {
            assert!(
                specs.is_valid(*cp),
                "Codepoint {cp} is not valid, but should be"
            );
        }
    }

    #[rstest]
    #[case(&[82])]
    fn test_not_valid(#[case] cps: &[CodePoint], specs: &CodePointsSpecs) {
        for cp in cps {
            assert!(
                !specs.is_valid(*cp),
                "Codepoint {cp} is valid, but should not be"
            );
        }
    }
}