ens_normalize_rs/code_points/
specs.rs

1use super::types::*;
2use crate::{
3    constants,
4    static_data::{
5        nf_json,
6        spec_json::{self, GroupName},
7    },
8    utils, CodePoint,
9};
10use regex::Regex;
11use std::collections::{HashMap, HashSet};
12
13/// This struct contains logic for validating and normalizing code points.
14pub struct CodePointsSpecs {
15    cm: HashSet<CodePoint>,
16    ignored: HashSet<CodePoint>,
17    mapped: HashMap<CodePoint, Vec<CodePoint>>,
18    nfc_check: HashSet<CodePoint>,
19    whole_map: ParsedWholeMap,
20    fenced: HashMap<CodePoint, String>,
21    groups: Vec<ParsedGroup>,
22    group_name_to_index: HashMap<spec_json::GroupName, usize>,
23    valid: HashSet<CodePoint>,
24    nsm: HashSet<CodePoint>,
25    nsm_max: u32,
26    emoji_no_fe0f_to_pretty: HashMap<Vec<CodePoint>, Vec<CodePoint>>,
27    decomp: HashMap<CodePoint, Vec<CodePoint>>,
28    emoji_regex: Regex,
29}
30
31impl CodePointsSpecs {
32    pub fn new(spec: spec_json::Spec, nf: nf_json::Nf) -> Self {
33        let emoji: HashSet<Vec<CodePoint>> = spec.emoji.into_iter().collect();
34        let emoji_no_fe0f_to_pretty = emoji
35            .iter()
36            .map(|e| (utils::filter_fe0f(e), e.clone()))
37            .collect();
38        let decomp = nf
39            .decomp
40            .into_iter()
41            .map(|item| (item.number, item.nested_numbers))
42            .collect();
43        let groups: Vec<ParsedGroup> = spec.groups.into_iter().map(ParsedGroup::from).collect();
44        let group_name_to_index: HashMap<spec_json::GroupName, usize> = groups
45            .iter()
46            .enumerate()
47            .map(|(i, g)| (g.name.clone(), i))
48            .collect();
49        let valid = compute_valid(&groups, &decomp);
50        let whole_map = compute_whole_map(spec.whole_map);
51        let emoji_str_list = emoji
52            .iter()
53            .map(|cps| utils::cps2str(cps))
54            .collect::<Vec<_>>();
55        let emoji_regex =
56            create_emoji_regex_pattern(emoji_str_list).expect("failed to create emoji regex");
57
58        Self {
59            cm: spec.cm.into_iter().collect(),
60            emoji_no_fe0f_to_pretty,
61            ignored: spec.ignored.into_iter().collect(),
62            mapped: spec.mapped.into_iter().map(|m| (m.from, m.to)).collect(),
63            nfc_check: spec.nfc_check.into_iter().collect(),
64            fenced: spec.fenced.into_iter().map(|f| (f.from, f.to)).collect(),
65            valid,
66            groups,
67            nsm: spec.nsm.into_iter().collect(),
68            nsm_max: spec.nsm_max,
69            decomp,
70            whole_map,
71            group_name_to_index,
72            emoji_regex,
73        }
74    }
75}
76
77impl Default for CodePointsSpecs {
78    fn default() -> Self {
79        let spec = spec_json::Spec::default();
80        let nf = nf_json::Nf::default();
81        Self::new(spec, nf)
82    }
83}
84
85impl CodePointsSpecs {
86    pub fn get_mapping(&self, cp: CodePoint) -> Option<&Vec<CodePoint>> {
87        self.mapped.get(&cp)
88    }
89
90    pub fn cps_is_emoji(&self, cps: &[CodePoint]) -> bool {
91        let s = utils::cps2str(cps);
92        let maybe_match = self.finditer_emoji(&s).next();
93        maybe_match
94            .map(|m| m.start() == 0 && m.end() == s.len())
95            .unwrap_or(false)
96    }
97
98    pub fn finditer_emoji<'a>(&'a self, s: &'a str) -> impl Iterator<Item = regex::Match<'_>> {
99        self.emoji_regex.find_iter(s)
100    }
101
102    pub fn cps_requires_check(&self, cps: &[CodePoint]) -> bool {
103        cps.iter().any(|cp| self.nfc_check.contains(cp))
104    }
105
106    pub fn cps_emoji_no_fe0f_to_pretty(&self, cps: &[CodePoint]) -> Option<&Vec<CodePoint>> {
107        self.emoji_no_fe0f_to_pretty.get(cps)
108    }
109
110    pub fn maybe_normalize(&self, cp: CodePoint) -> Option<&Vec<CodePoint>> {
111        self.mapped.get(&cp)
112    }
113
114    pub fn is_valid(&self, cp: CodePoint) -> bool {
115        self.valid.contains(&cp)
116    }
117
118    pub fn is_ignored(&self, cp: CodePoint) -> bool {
119        self.ignored.contains(&cp)
120    }
121
122    pub fn is_stop(&self, cp: CodePoint) -> bool {
123        cp == constants::CP_STOP
124    }
125
126    pub fn is_fenced(&self, cp: CodePoint) -> bool {
127        self.fenced.contains_key(&cp)
128    }
129
130    pub fn is_cm(&self, cp: CodePoint) -> bool {
131        self.cm.contains(&cp)
132    }
133
134    pub fn groups_for_cps<'a>(
135        &'a self,
136        cps: &'a [CodePoint],
137    ) -> impl Iterator<Item = &'a ParsedGroup> {
138        self.groups
139            .iter()
140            .filter(|group| cps.iter().all(|cp| group.contains_cp(*cp)))
141    }
142
143    pub fn is_nsm(&self, cp: CodePoint) -> bool {
144        self.nsm.contains(&cp)
145    }
146
147    pub fn nsm_max(&self) -> u32 {
148        self.nsm_max
149    }
150
151    pub fn decompose(&self, cp: CodePoint) -> Option<&Vec<CodePoint>> {
152        self.decomp.get(&cp)
153    }
154
155    pub fn whole_map(&self, cp: CodePoint) -> Option<&ParsedWholeValue> {
156        self.whole_map.get(&cp)
157    }
158
159    pub fn group_by_name(&self, name: impl Into<GroupName>) -> Option<&ParsedGroup> {
160        self.group_name_to_index
161            .get(&name.into())
162            .and_then(|i| self.groups.get(*i))
163    }
164}
165
166fn compute_valid(
167    groups: &[ParsedGroup],
168    decomp: &HashMap<CodePoint, Vec<CodePoint>>,
169) -> HashSet<CodePoint> {
170    let mut valid = HashSet::new();
171    for g in groups {
172        valid.extend(g.primary_plus_secondary.iter());
173    }
174
175    let ndf: Vec<CodePoint> = valid
176        .iter()
177        .flat_map(|cp| decomp.get(cp).cloned().unwrap_or_default())
178        .collect();
179    valid.extend(ndf);
180    valid
181}
182
183fn compute_whole_map(whole_map: HashMap<String, spec_json::WholeValue>) -> ParsedWholeMap {
184    whole_map
185        .into_iter()
186        .map(|(k, v)| (k.parse::<CodePoint>().unwrap(), v.try_into().unwrap()))
187        .collect()
188}
189
190fn create_emoji_regex_pattern(emojis: Vec<impl AsRef<str>>) -> Result<Regex, regex::Error> {
191    let fe0f = regex::escape(constants::STR_FE0F);
192
193    // Make FE0F optional
194    let make_emoji = |emoji: &str| regex::escape(emoji).replace(&fe0f, &format!("{}?", fe0f));
195
196    // Order emojis to match the longest ones first
197    let order = |emoji: &str| emoji.replace(constants::STR_FE0F, "").len();
198
199    let mut sorted_emojis = emojis;
200    sorted_emojis.sort_by_key(|b| std::cmp::Reverse(order(b.as_ref())));
201
202    let emoji_regex = sorted_emojis
203        .into_iter()
204        .map(|emoji| make_emoji(emoji.as_ref()))
205        .collect::<Vec<_>>()
206        .join("|");
207
208    regex::Regex::new(&emoji_regex)
209}
210
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use rstest::{fixture, rstest};

    // Non-ASCII inputs are written as `\u{...}` escapes so that the expected byte offsets
    // below do not depend on how this source file is encoded or displayed.

    /// Default specs, built once and shared by every test in this module.
    #[fixture]
    #[once]
    fn specs() -> CodePointsSpecs {
        CodePointsSpecs::default()
    }

    /// A mapped code point yields the expected replacement sequence.
    #[rstest]
    #[case::letter_a('A', "a")]
    // U+2165 ROMAN NUMERAL SIX
    #[case::roman_numeral_vi('\u{2165}', "vi")]
    fn test_mapped(#[case] input: char, #[case] output: &str, specs: &CodePointsSpecs) {
        let mapped = specs.get_mapping(input as u32);
        let expected = output.chars().map(|c| c as u32).collect::<Vec<_>>();
        assert_eq!(mapped, Some(&expected));
    }

    /// The first character of `fence` is a key of the fenced table.
    #[rstest]
    // U+2044 FRACTION SLASH
    #[case::slash("\u{2044}")]
    fn test_fenced(#[case] fence: &str, specs: &CodePointsSpecs) {
        assert!(
            specs
                .fenced
                .contains_key(&(fence.chars().next().unwrap() as u32)),
            "Fence {fence} not found"
        );
    }

    /// Emoji matches are reported with their text and byte offsets.
    #[rstest]
    // U+1F600 GRINNING FACE is 4 bytes in UTF-8, so it spans bytes 5..9 after "hello".
    #[case::string("hello\u{1F600}", vec![("\u{1F600}", 5, 9)])]
    // U+1F468 MAN + U+200D ZERO WIDTH JOINER + U+1F4BB PERSONAL COMPUTER: 4 + 3 + 4 bytes.
    #[case::man_technologist(
        "\u{1F468}\u{200D}\u{1F4BB}",
        vec![("\u{1F468}\u{200D}\u{1F4BB}", 0, 11)]
    )]
    fn test_emoji(
        #[case] emoji: &str,
        #[case] expected: Vec<(&str, usize, usize)>,
        specs: &CodePointsSpecs,
    ) {
        let matches = specs.finditer_emoji(emoji).collect::<Vec<_>>();
        assert_eq!(matches.len(), expected.len());
        for (i, (emoji, start, end)) in expected.into_iter().enumerate() {
            assert_eq!(matches[i].as_str(), emoji);
            assert_eq!(matches[i].start(), start);
            assert_eq!(matches[i].end(), end);
        }
    }

    /// Every listed code point is reported as valid.
    #[rstest]
    #[case::small(&[36, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 95, 97])]
    #[case::big(&[205743, 205742, 205741, 205740, 205739, 205738, 205737, 205736])]
    fn test_valid(#[case] cps: &[CodePoint], specs: &CodePointsSpecs) {
        for cp in cps {
            assert!(
                specs.is_valid(*cp),
                "Codepoint {cp} is not valid, but should be"
            );
        }
    }

    /// Every listed code point is reported as not valid.
    #[rstest]
    // 82 is 'R'
    #[case(&[82])]
    fn test_not_valid(#[case] cps: &[CodePoint], specs: &CodePointsSpecs) {
        for cp in cps {
            assert!(
                !specs.is_valid(*cp),
                "Codepoint {cp} is valid, but should not be"
            );
        }
    }
}