//! Source file: `ens_normalize/spec.rs`.

1use crate::nf::{nfc, nfd};
2use crate::utils::{
3    EnsError, Result, array_replace, bidi_qq, compare_arrays, explode_cp, quote_cp,
4    safe_str_from_cps, str_from_cps,
5};
6use serde::Deserialize;
7use std::collections::{HashMap, HashSet};
8use std::sync::LazyLock;
9
/// U+002D HYPHEN-MINUS; an ASCII label may not have it as both its third and
/// fourth code point.
const HYPHEN: u32 = '-' as u32;
/// U+002E FULL STOP, the separator between labels.
const STOP: u32 = '.' as u32;
/// U+FE0F VARIATION SELECTOR-16; optional inside emoji sequences and removed
/// from them when the `DropFe0f` filter is in effect.
const FE0F: u32 = '\u{FE0F}' as u32;
/// Sentinel stored in the whole-script map for a code point that belongs to
/// exactly one group and is not listed as confusable.
const UNIQUE_PH: usize = usize::MAX;
14
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub struct Label {
17    pub input: Vec<u32>,
18    pub offset: usize,
19    pub error: Option<EnsError>,
20    pub tokens: Option<Vec<Vec<u32>>>,
21    pub output: Option<Vec<u32>>,
22    pub emoji: Option<bool>,
23    pub label_type: Option<String>,
24}
25
/// One unit of a tokenized name, as produced by `ens_tokenize`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// The label separator (U+002E).
    Stop {
        cp: u32,
    },
    /// A code point that is neither valid, ignored, nor mapped.
    Disallowed {
        cp: u32,
    },
    /// A code point listed in the spec's ignored set.
    Ignored {
        cp: u32,
    },
    /// A run of valid code points (adjacent valid tokens are collapsed into one).
    Valid {
        cps: Vec<u32>,
    },
    /// A code point `cp` together with the sequence `cps` it maps to.
    Mapped {
        cp: u32,
        cps: Vec<u32>,
    },
    /// A matched emoji sequence.
    Emoji {
        /// The code points actually consumed from the input.
        input: Vec<u32>,
        /// The emoji sequence with every U+FE0F removed.
        cps: Vec<u32>,
        /// The emoji sequence as stored in the spec.
        emoji: Vec<u32>,
    },
    /// A run of text whose NFC form differs from its input.
    Nfc {
        /// Code points of the run before NFC.
        input: Vec<u32>,
        /// The original tokens covering the run (valid runs collapsed).
        tokens0: Vec<Token>,
        /// Code points of the run after NFC.
        cps: Vec<u32>,
        /// Tokenization of the NFC output, produced without a further NFC pass.
        tokens: Vec<Token>,
    },
}

impl Token {
    /// Returns the lowercase name of the variant.
    pub fn token_type(&self) -> &'static str {
        match self {
            Token::Stop { .. } => "stop",
            Token::Disallowed { .. } => "disallowed",
            Token::Ignored { .. } => "ignored",
            Token::Valid { .. } => "valid",
            Token::Mapped { .. } => "mapped",
            Token::Emoji { .. } => "emoji",
            Token::Nfc { .. } => "nfc",
        }
    }

    /// Returns the output code points of the token, or `None` for the
    /// variants that carry no `cps` field (`Stop`, `Disallowed`, `Ignored`).
    pub fn cps(&self) -> Option<&[u32]> {
        // Every variant is listed explicitly so that adding a new one is a
        // compile error here instead of silently yielding `None`.
        match self {
            Token::Valid { cps }
            | Token::Mapped { cps, .. }
            | Token::Emoji { cps, .. }
            | Token::Nfc { cps, .. } => Some(cps),
            Token::Stop { .. } | Token::Disallowed { .. } | Token::Ignored { .. } => None,
        }
    }
}
80
/// Options accepted by `ens_tokenize_with_options`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TokenizeOptions {
    /// When `true`, runs of text whose NFC form differs from the input are
    /// bundled into `Token::Nfc` tokens.
    pub nf: bool,
}

impl Default for TokenizeOptions {
    /// The default enables the NFC pass.
    fn default() -> Self {
        TokenizeOptions { nf: true }
    }
}
91
92#[derive(Deserialize)]
93struct RawSpec {
94    emoji: Vec<Vec<u32>>,
95    ignored: Vec<u32>,
96    mapped: Vec<(u32, Vec<u32>)>,
97    fenced: Vec<(u32, String)>,
98    wholes: Vec<RawWhole>,
99    cm: Vec<u32>,
100    nsm: Vec<u32>,
101    nsm_max: usize,
102    escape: Vec<u32>,
103    groups: Vec<RawGroup>,
104    nfc_check: Vec<u32>,
105}
106
107#[derive(Deserialize)]
108struct RawWhole {
109    valid: Vec<u32>,
110    confused: Vec<u32>,
111}
112
113#[derive(Deserialize)]
114struct RawGroup {
115    name: String,
116    #[serde(default)]
117    restricted: bool,
118    primary: Vec<u32>,
119    secondary: Vec<u32>,
120    cm: Option<Vec<serde_json::Value>>,
121}
122
/// A script group in indexed form.
struct Group {
    /// Display name (already wrapped as `Restricted[...]` where applicable).
    name: String,
    /// Primary code points of the group.
    primary: HashSet<u32>,
    /// Secondary code points of the group.
    secondary: HashSet<u32>,
    /// Whether labels of this group are subject to the non-spacing-mark check.
    check_nsm: bool,
}

impl Group {
    /// Returns `true` when `cp` is a primary or secondary member of the group.
    fn has_cp(&self, cp: u32) -> bool {
        [&self.primary, &self.secondary]
            .into_iter()
            .any(|members| members.contains(&cp))
    }
}
135
/// Indexed whole-script confusable record.
struct Whole {
    /// For each member code point, the indices of the groups used by the
    /// record that lie outside that code point's own extent.
    complements: HashMap<u32, Vec<usize>>,
}
139
/// A node of the emoji trie.
#[derive(Default)]
struct EmojiNode {
    /// Outgoing edges: code point to index of the child node.
    children: HashMap<u32, usize>,
    /// The emoji sequence that ends at this node, if any.
    value: Option<Vec<u32>>,
}

/// Trie over emoji sequences, stored as a flat list of nodes addressed by index.
#[derive(Default)]
struct EmojiTrie {
    nodes: Vec<EmojiNode>,
}

impl EmojiTrie {
    /// Creates a trie that contains only the root node (index 0).
    fn new() -> Self {
        let root = EmojiNode::default();
        EmojiTrie { nodes: vec![root] }
    }

    /// Returns the index of the child of `node` reached via `cp`, creating an
    /// empty child first when the edge does not exist yet.
    fn child_or_insert(&mut self, node: usize, cp: u32) -> usize {
        match self.nodes[node].children.get(&cp).copied() {
            Some(existing) => existing,
            None => {
                // The new node is appended, so its index is the current length.
                let fresh = self.nodes.len();
                self.nodes.push(EmojiNode::default());
                self.nodes[node].children.insert(cp, fresh);
                fresh
            }
        }
    }
}
168
169struct EnsData {
170    mapped: HashMap<u32, Vec<u32>>,
171    ignored: HashSet<u32>,
172    cm: HashSet<u32>,
173    nsm: HashSet<u32>,
174    nsm_max: usize,
175    escape: HashSet<u32>,
176    nfc_check: HashSet<u32>,
177    fenced: HashMap<u32, String>,
178    groups: Vec<Group>,
179    whole_map: HashMap<u32, usize>,
180    wholes: Vec<Whole>,
181    valid: HashSet<u32>,
182    emoji_list: Vec<Vec<u32>>,
183    emoji_root: EmojiTrie,
184}
185
186static ENS: LazyLock<EnsData> = LazyLock::new(|| {
187    let raw: RawSpec =
188        serde_json::from_str(include_str!("../data/spec.json")).expect("valid spec.json");
189    EnsData::from_raw(raw)
190});
191
192impl EnsData {
193    fn from_raw(raw: RawSpec) -> Self {
194        let groups: Vec<Group> = raw
195            .groups
196            .into_iter()
197            .map(|g| {
198                let name = if g.restricted {
199                    format!("Restricted[{}]", g.name)
200                } else {
201                    g.name
202                };
203                Group {
204                    name,
205                    primary: g.primary.into_iter().collect(),
206                    secondary: g.secondary.into_iter().collect(),
207                    check_nsm: g.cm.is_none(),
208                }
209            })
210            .collect();
211
212        let mut wholes = Vec::new();
213        let mut whole_map = HashMap::new();
214        for raw_whole in raw.wholes {
215            if raw_whole.confused.is_empty() {
216                continue;
217            }
218
219            let values: Vec<u32> = raw_whole
220                .valid
221                .iter()
222                .chain(raw_whole.confused.iter())
223                .copied()
224                .collect();
225            let complements = compute_whole_complements(&groups, &values);
226            let whole_index = wholes.len();
227            for cp in raw_whole.confused {
228                whole_map.insert(cp, whole_index);
229            }
230            wholes.push(Whole { complements });
231        }
232
233        let mut valid = HashSet::new();
234        let mut multi = HashSet::new();
235        for g in &groups {
236            for &cp in g.primary.iter().chain(g.secondary.iter()) {
237                if !valid.insert(cp) {
238                    multi.insert(cp);
239                }
240            }
241        }
242
243        for &cp in &valid {
244            if !whole_map.contains_key(&cp) && !multi.contains(&cp) {
245                whole_map.insert(cp, UNIQUE_PH);
246            }
247        }
248
249        let valid_vec: Vec<u32> = valid.iter().copied().collect();
250        for cp in nfd(&valid_vec) {
251            valid.insert(cp);
252        }
253
254        let mut emoji_list = raw.emoji;
255        emoji_list.sort_by(|a, b| compare_arrays(a, b).cmp(&0));
256        let mut emoji_root = EmojiTrie::new();
257        for cps in &emoji_list {
258            let mut prev = vec![0usize];
259            for &cp in cps {
260                let next: Vec<usize> = prev
261                    .iter()
262                    .map(|&node| emoji_root.child_or_insert(node, cp))
263                    .collect();
264                if cp == FE0F {
265                    prev.extend(next);
266                } else {
267                    prev = next;
268                }
269            }
270            for node in prev {
271                emoji_root.nodes[node].value = Some(cps.clone());
272            }
273        }
274
275        Self {
276            mapped: raw.mapped.into_iter().collect(),
277            ignored: raw.ignored.into_iter().collect(),
278            cm: raw.cm.into_iter().collect(),
279            nsm: raw.nsm.into_iter().collect(),
280            nsm_max: raw.nsm_max,
281            escape: raw.escape.into_iter().collect(),
282            nfc_check: raw.nfc_check.into_iter().collect(),
283            fenced: raw.fenced.into_iter().collect(),
284            groups,
285            whole_map,
286            wholes,
287            valid,
288            emoji_list,
289            emoji_root,
290        }
291    }
292}
293
/// Working record used while computing whole-script complements: a set of
/// connected groups and the code points that connected them.
struct WholeRec {
    /// Indices of the groups in this extent (no duplicates).
    groups: Vec<usize>,
    /// Code points assigned to this extent.
    values: Vec<u32>,
}
298
/// Appends `x` to `v` unless `v` already contains it.
fn push_unique(v: &mut Vec<usize>, x: usize) {
    let already_present = v.iter().any(|&existing| existing == x);
    if already_present {
        return;
    }
    v.push(x);
}
304
305fn compute_whole_complements(groups: &[Group], values: &[u32]) -> HashMap<u32, Vec<usize>> {
306    let mut recs: Vec<WholeRec> = Vec::new();
307    for &cp in values {
308        let gs: Vec<usize> = groups
309            .iter()
310            .enumerate()
311            .filter_map(|(i, g)| g.has_cp(cp).then_some(i))
312            .collect();
313        let rec_index = recs
314            .iter()
315            .position(|rec| gs.iter().any(|g| rec.groups.contains(g)));
316        let rec_index = match rec_index {
317            Some(i) => i,
318            None => {
319                recs.push(WholeRec {
320                    groups: Vec::new(),
321                    values: Vec::new(),
322                });
323                recs.len() - 1
324            }
325        };
326        recs[rec_index].values.push(cp);
327        for g in gs {
328            push_unique(&mut recs[rec_index].groups, g);
329        }
330    }
331
332    let mut union = Vec::new();
333    for rec in &recs {
334        for &g in &rec.groups {
335            push_unique(&mut union, g);
336        }
337    }
338
339    let mut complements = HashMap::new();
340    for rec in recs {
341        let complement: Vec<usize> = union
342            .iter()
343            .copied()
344            .filter(|g| !rec.groups.contains(g))
345            .collect();
346        for cp in rec.values {
347            complements.insert(cp, complement.clone());
348        }
349    }
350    complements
351}
352
/// A token of a label during normalization: either a run of normalized text
/// or a single emoji sequence.
#[derive(Clone)]
struct NormToken {
    /// Output code points of the token.
    cps: Vec<u32>,
    /// `true` when the token is an emoji sequence rather than text.
    is_emoji: bool,
}
358
359pub fn is_combining_mark(cp: u32, only_nsm: bool) -> bool {
360    if only_nsm {
361        ENS.nsm.contains(&cp)
362    } else {
363        ENS.cm.contains(&cp)
364    }
365}
366
367pub fn should_escape(cp: u32) -> bool {
368    ENS.escape.contains(&cp)
369}
370
371pub fn ens_emoji() -> Vec<Vec<u32>> {
372    ENS.emoji_list.clone()
373}
374
375pub fn ens_normalize_fragment(frag: &str, decompose: bool) -> Result<String> {
376    let nf = if decompose { nfd } else { nfc };
377    let mut out = Vec::new();
378    for (i, label) in frag.split('.').enumerate() {
379        if i > 0 {
380            out.push(STOP);
381        }
382        let input = explode_cp(label);
383        let tokens = tokens_from_str(&input, nf, EmojiFilter::DropFe0f)?;
384        out.extend(tokens.into_iter().flat_map(|t| t.cps));
385    }
386    str_from_cps(&out)
387}
388
389pub fn ens_normalize(name: &str) -> Result<String> {
390    flatten(split(name, nfc, EmojiFilter::DropFe0f))
391}
392
393pub fn ens_beautify(name: &str) -> Result<String> {
394    let mut labels = split(name, nfc, EmojiFilter::Preserve);
395    for label in &mut labels {
396        if label.error.is_some() {
397            break;
398        }
399        if label.label_type.as_deref() != Some("Greek")
400            && let Some(output) = &mut label.output
401        {
402            array_replace(output, 0x3BE, 0x39E);
403        }
404    }
405    flatten(labels)
406}
407
408pub fn ens_split(name: &str, preserve_emoji: bool) -> Vec<Label> {
409    split(
410        name,
411        nfc,
412        if preserve_emoji {
413            EmojiFilter::Preserve
414        } else {
415            EmojiFilter::DropFe0f
416        },
417    )
418}
419
420fn split(name: &str, nf: fn(&[u32]) -> Vec<u32>, ef: EmojiFilter) -> Vec<Label> {
421    if name.is_empty() {
422        return Vec::new();
423    }
424
425    let mut offset = 0usize;
426    name.split('.')
427        .map(|label| {
428            let input = explode_cp(label);
429            let mut info = Label {
430                input: input.clone(),
431                offset,
432                error: None,
433                tokens: None,
434                output: None,
435                emoji: None,
436                label_type: None,
437            };
438            offset += input.len() + 1;
439
440            if let Err(err) = process_label(&input, nf, ef, &mut info) {
441                info.error = Some(err);
442            }
443            info
444        })
445        .collect()
446}
447
448fn process_label(
449    input: &[u32],
450    nf: fn(&[u32]) -> Vec<u32>,
451    ef: EmojiFilter,
452    info: &mut Label,
453) -> Result<()> {
454    let tokens = tokens_from_str(input, nf, ef)?;
455    info.tokens = Some(tokens.iter().map(|t| t.cps.clone()).collect());
456    if tokens.is_empty() {
457        return Err(EnsError::new("empty label"));
458    }
459
460    let output: Vec<u32> = tokens.iter().flat_map(|t| t.cps.iter().copied()).collect();
461    info.output = Some(output.clone());
462    check_leading_underscore(&output)?;
463    let emoji = tokens.len() > 1 || tokens[0].is_emoji;
464    info.emoji = Some(emoji);
465    let label_type = if !emoji && output.iter().all(|&cp| cp < 0x80) {
466        check_label_extension(&output)?;
467        "ASCII".to_string()
468    } else {
469        let chars: Vec<u32> = tokens
470            .iter()
471            .filter(|t| !t.is_emoji)
472            .flat_map(|t| t.cps.iter().copied())
473            .collect();
474        if chars.is_empty() {
475            "Emoji".to_string()
476        } else {
477            if ENS.cm.contains(&output[0]) {
478                return Err(error_placement("leading combining mark"));
479            }
480            for i in 1..tokens.len() {
481                if !tokens[i].is_emoji && ENS.cm.contains(&tokens[i].cps[0]) {
482                    let prev = str_from_cps(&tokens[i - 1].cps)?;
483                    let mark = safe_str_from_cps(&[tokens[i].cps[0]], None);
484                    return Err(error_placement(&format!(
485                        "emoji + combining mark: \"{prev} + {mark}\""
486                    )));
487                }
488            }
489
490            check_fenced(&output)?;
491            let unique = unique_preserving_order(&chars);
492            let group = determine_group(&unique)?;
493            check_group(group, &chars)?;
494            check_whole(group, &unique)?;
495            ENS.groups[group].name.clone()
496        }
497    };
498
499    info.label_type = Some(label_type);
500    Ok(())
501}
502
/// Returns the distinct code points of `cps` in order of first occurrence.
fn unique_preserving_order(cps: &[u32]) -> Vec<u32> {
    let mut seen = HashSet::new();
    cps.iter()
        .copied()
        // `insert` returns true only the first time a value is added.
        .filter(|&cp| seen.insert(cp))
        .collect()
}
513
514fn check_label_extension(cps: &[u32]) -> Result<()> {
515    if cps.len() >= 4 && cps[2] == HYPHEN && cps[3] == HYPHEN {
516        let s = str_from_cps(&cps[..4])?;
517        Err(EnsError::new(format!("invalid label extension: \"{s}\"")))
518    } else {
519        Ok(())
520    }
521}
522
523fn check_leading_underscore(cps: &[u32]) -> Result<()> {
524    const UNDERSCORE: u32 = 0x5F;
525    if let Some(mut i) = cps.iter().rposition(|&cp| cp == UNDERSCORE) {
526        while i > 0 {
527            i -= 1;
528            if cps[i] != UNDERSCORE {
529                return Err(EnsError::new("underscore allowed only at start"));
530            }
531        }
532    }
533    Ok(())
534}
535
536fn check_fenced(cps: &[u32]) -> Result<()> {
537    if cps.is_empty() {
538        return Ok(());
539    }
540    let mut prev = ENS.fenced.get(&cps[0]);
541    if let Some(prev) = prev {
542        return Err(error_placement(&format!("leading {prev}")));
543    }
544
545    let mut last = usize::MAX;
546    for (i, &cp) in cps.iter().enumerate().skip(1) {
547        if let Some(matched) = ENS.fenced.get(&cp) {
548            if last == i {
549                return Err(error_placement(&format!("{} + {matched}", prev.unwrap())));
550            }
551            last = i + 1;
552            prev = Some(matched);
553        }
554    }
555    if last == cps.len()
556        && let Some(prev) = prev
557    {
558        return Err(error_placement(&format!("trailing {prev}")));
559    }
560    Ok(())
561}
562
563fn determine_group(unique: &[u32]) -> Result<usize> {
564    let mut groups: Vec<usize> = (0..ENS.groups.len()).collect();
565    for &cp in unique {
566        let gs: Vec<usize> = groups
567            .iter()
568            .copied()
569            .filter(|&i| ENS.groups[i].has_cp(cp))
570            .collect();
571        if gs.is_empty() {
572            if !ENS.groups.iter().any(|g| g.has_cp(cp)) {
573                return Err(error_disallowed(cp));
574            }
575            return Err(error_group_member(groups[0], cp));
576        }
577        groups = gs;
578        if groups.len() == 1 {
579            break;
580        }
581    }
582    Ok(groups[0])
583}
584
585fn check_group(group: usize, cps: &[u32]) -> Result<()> {
586    let g = &ENS.groups[group];
587    for &cp in cps {
588        if !g.has_cp(cp) {
589            return Err(error_group_member(group, cp));
590        }
591    }
592
593    if g.check_nsm {
594        let decomposed = nfd(cps);
595        let mut i = 1usize;
596        while i < decomposed.len() {
597            if ENS.nsm.contains(&decomposed[i]) {
598                let mut j = i + 1;
599                while j < decomposed.len() && ENS.nsm.contains(&decomposed[j]) {
600                    for k in i..j {
601                        if decomposed[k] == decomposed[j] {
602                            return Err(EnsError::new(format!(
603                                "duplicate non-spacing marks: {}",
604                                quoted_cp(decomposed[j])
605                            )));
606                        }
607                    }
608                    j += 1;
609                }
610                if j - i > ENS.nsm_max {
611                    let s = safe_str_from_cps(&decomposed[i - 1..j], None);
612                    return Err(EnsError::new(format!(
613                        "excessive non-spacing marks: {} ({}/{})",
614                        bidi_qq(&s),
615                        j - i,
616                        ENS.nsm_max
617                    )));
618                }
619                i = j;
620            } else {
621                i += 1;
622            }
623        }
624    }
625
626    Ok(())
627}
628
629fn check_whole(group: usize, unique: &[u32]) -> Result<()> {
630    let mut maker: Option<Vec<usize>> = None;
631    let mut shared = Vec::new();
632    for &cp in unique {
633        match ENS.whole_map.get(&cp).copied() {
634            Some(UNIQUE_PH) => return Ok(()),
635            Some(whole_index) => {
636                let set = ENS.wholes[whole_index]
637                    .complements
638                    .get(&cp)
639                    .cloned()
640                    .unwrap_or_default();
641                maker = Some(match maker {
642                    Some(prev) => prev.into_iter().filter(|g| set.contains(g)).collect(),
643                    None => set,
644                });
645                if maker.as_ref().is_some_and(|m| m.is_empty()) {
646                    return Ok(());
647                }
648            }
649            None => shared.push(cp),
650        }
651    }
652
653    if let Some(maker) = maker {
654        for other in maker {
655            if shared.iter().all(|&cp| ENS.groups[other].has_cp(cp)) {
656                return Err(EnsError::new(format!(
657                    "whole-script confusable: {}/{}",
658                    ENS.groups[group].name, ENS.groups[other].name
659                )));
660            }
661        }
662    }
663    Ok(())
664}
665
666fn flatten(labels: Vec<Label>) -> Result<String> {
667    let multiple = labels.len() != 1;
668    let mut out = Vec::new();
669    for label in labels {
670        if let Some(error) = label.error {
671            if multiple {
672                let safe = safe_str_from_cps(&label.input, Some(63));
673                return Err(EnsError::new(format!(
674                    "Invalid label {}: {}",
675                    bidi_qq(&safe),
676                    error.message()
677                )));
678            }
679            return Err(error);
680        }
681        out.push(str_from_cps(label.output.as_deref().unwrap_or_default())?);
682    }
683    Ok(out.join("."))
684}
685
686fn quoted_cp(cp: u32) -> String {
687    let prefix = if should_escape(cp) {
688        String::new()
689    } else {
690        format!("{} ", bidi_qq(&safe_str_from_cps(&[cp], None)))
691    };
692    format!("{prefix}{}", quote_cp(cp))
693}
694
695fn error_disallowed(cp: u32) -> EnsError {
696    EnsError::new(format!("disallowed character: {}", quoted_cp(cp)))
697}
698
699fn error_group_member(group: usize, cp: u32) -> EnsError {
700    let mut quoted = quoted_cp(cp);
701    if let Some(gg) = ENS.groups.iter().find(|g| g.primary.contains(&cp)) {
702        quoted = format!("{} {quoted}", gg.name);
703    }
704    EnsError::new(format!(
705        "illegal mixture: {} + {quoted}",
706        ENS.groups[group].name
707    ))
708}
709
710fn error_placement(where_: &str) -> EnsError {
711    EnsError::new(format!("illegal placement: {where_}"))
712}
713
/// How matched emoji sequences are written to the output.
#[derive(Clone, Copy, Debug)]
enum EmojiFilter {
    /// Keep the sequence exactly as stored in the spec.
    Preserve,
    /// Remove every U+FE0F from the sequence.
    DropFe0f,
}
719
720fn filter_emoji(cps: &[u32], filter: EmojiFilter) -> Vec<u32> {
721    match filter {
722        EmojiFilter::Preserve => cps.to_vec(),
723        EmojiFilter::DropFe0f => cps.iter().copied().filter(|&cp| cp != FE0F).collect(),
724    }
725}
726
727fn tokens_from_str(
728    input: &[u32],
729    nf: fn(&[u32]) -> Vec<u32>,
730    ef: EmojiFilter,
731) -> Result<Vec<NormToken>> {
732    let mut ret = Vec::new();
733    let mut chars = Vec::new();
734    let mut input = input.to_vec();
735    input.reverse();
736
737    while !input.is_empty() {
738        if let Some(emoji) = consume_emoji_reversed(&mut input, None) {
739            if !chars.is_empty() {
740                ret.push(NormToken {
741                    cps: nf(&chars),
742                    is_emoji: false,
743                });
744                chars.clear();
745            }
746            ret.push(NormToken {
747                cps: filter_emoji(&emoji, ef),
748                is_emoji: true,
749            });
750        } else {
751            let cp = input.pop().expect("input is not empty");
752            if ENS.valid.contains(&cp) {
753                chars.push(cp);
754            } else if let Some(cps) = ENS.mapped.get(&cp) {
755                chars.extend_from_slice(cps);
756            } else if !ENS.ignored.contains(&cp) {
757                return Err(error_disallowed(cp));
758            }
759        }
760    }
761
762    if !chars.is_empty() {
763        ret.push(NormToken {
764            cps: nf(&chars),
765            is_emoji: false,
766        });
767    }
768
769    Ok(ret)
770}
771
772fn consume_emoji_reversed(input: &mut Vec<u32>, eaten: Option<&mut Vec<u32>>) -> Option<Vec<u32>> {
773    let mut eaten = eaten;
774    let mut node = 0usize;
775    let mut emoji = None;
776    let mut pos = input.len();
777    while pos > 0 {
778        pos -= 1;
779        let cp = input[pos];
780        let Some(&child) = ENS.emoji_root.nodes[node].children.get(&cp) else {
781            break;
782        };
783        node = child;
784        if let Some(value) = ENS.emoji_root.nodes[node].value.clone() {
785            if let Some(eaten) = eaten.as_deref_mut() {
786                eaten.extend(input[pos..].iter().rev().copied());
787            }
788            input.truncate(pos);
789            emoji = Some(value);
790        }
791    }
792    emoji
793}
794
795pub fn ens_tokenize(name: &str) -> Vec<Token> {
796    ens_tokenize_with_options(name, TokenizeOptions::default())
797}
798
799pub fn ens_tokenize_with_options(name: &str, options: TokenizeOptions) -> Vec<Token> {
800    tokenize(name, options.nf)
801}
802
803fn tokenize(name: &str, nf: bool) -> Vec<Token> {
804    let mut input = explode_cp(name);
805    input.reverse();
806    let mut eaten = Vec::new();
807    let mut tokens = Vec::new();
808
809    while !input.is_empty() {
810        if let Some(emoji) = consume_emoji_reversed(&mut input, Some(&mut eaten)) {
811            tokens.push(Token::Emoji {
812                input: std::mem::take(&mut eaten),
813                cps: filter_emoji(&emoji, EmojiFilter::DropFe0f),
814                emoji,
815            });
816        } else {
817            let cp = input.pop().expect("input is not empty");
818            if cp == STOP {
819                tokens.push(Token::Stop { cp });
820            } else if ENS.valid.contains(&cp) {
821                tokens.push(Token::Valid { cps: vec![cp] });
822            } else if ENS.ignored.contains(&cp) {
823                tokens.push(Token::Ignored { cp });
824            } else if let Some(cps) = ENS.mapped.get(&cp) {
825                tokens.push(Token::Mapped {
826                    cp,
827                    cps: cps.clone(),
828                });
829            } else {
830                tokens.push(Token::Disallowed { cp });
831            }
832        }
833    }
834
835    if nf {
836        apply_token_nfc(&mut tokens);
837    }
838
839    collapse_valid_tokens(tokens)
840}
841
842fn is_valid_or_mapped(token: &Token) -> bool {
843    matches!(token, Token::Valid { .. } | Token::Mapped { .. })
844}
845
846fn valid_or_mapped_cps(token: &Token) -> Option<&[u32]> {
847    match token {
848        Token::Valid { cps } | Token::Mapped { cps, .. } => Some(cps),
849        _ => None,
850    }
851}
852
853fn requires_check(cps: &[u32]) -> bool {
854    cps.iter().any(|cp| ENS.nfc_check.contains(cp))
855}
856
857fn apply_token_nfc(tokens: &mut Vec<Token>) {
858    let mut i = 0usize;
859    let mut start: Option<usize> = None;
860    while i < tokens.len() {
861        if is_valid_or_mapped(&tokens[i]) {
862            let cps = valid_or_mapped_cps(&tokens[i]).unwrap();
863            if requires_check(cps) {
864                let mut end = i + 1;
865                let mut pos = end;
866                while pos < tokens.len() {
867                    if let Some(cps) = valid_or_mapped_cps(&tokens[pos]) {
868                        if !requires_check(cps) {
869                            break;
870                        }
871                        end = pos + 1;
872                    } else if !matches!(tokens[pos], Token::Ignored { .. }) {
873                        break;
874                    }
875                    pos += 1;
876                }
877                let start_i = start.unwrap_or(i);
878                let slice = tokens[start_i..end].to_vec();
879                let cps0: Vec<u32> = slice
880                    .iter()
881                    .filter_map(valid_or_mapped_cps)
882                    .flat_map(|cps| cps.iter().copied())
883                    .collect();
884                let cps = nfc(&cps0);
885                if compare_arrays(&cps, &cps0) != 0 {
886                    let text = str_from_cps(&cps).unwrap_or_default();
887                    let replacement = Token::Nfc {
888                        input: cps0,
889                        tokens0: collapse_valid_tokens(slice),
890                        cps,
891                        tokens: tokenize(&text, false),
892                    };
893                    tokens.splice(start_i..end, [replacement]);
894                    i = start_i;
895                } else {
896                    i = end.saturating_sub(1);
897                }
898                start = None;
899            } else {
900                start = Some(i);
901            }
902        } else if !matches!(tokens[i], Token::Ignored { .. }) {
903            start = None;
904        }
905        i += 1;
906    }
907}
908
909fn collapse_valid_tokens(tokens: Vec<Token>) -> Vec<Token> {
910    let mut out = Vec::new();
911    let mut i = 0usize;
912    while i < tokens.len() {
913        if let Token::Valid { .. } = &tokens[i] {
914            let mut cps = Vec::new();
915            while i < tokens.len() {
916                if let Token::Valid { cps: next } = &tokens[i] {
917                    cps.extend_from_slice(next);
918                    i += 1;
919                } else {
920                    break;
921                }
922            }
923            out.push(Token::Valid { cps });
924        } else {
925            out.push(tokens[i].clone());
926            i += 1;
927        }
928    }
929    out
930}