Skip to main content

ens_normalize/
spec.rs

1use crate::intmap::{IntMap, IntSet};
2use crate::nf::{nfc, nfd};
3use crate::utils::{
4    EnsError, Result, array_replace, bidi_qq, compare_arrays, explode_cp, quote_cp,
5    safe_str_from_cps, str_from_cps,
6};
7use serde::Deserialize;
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Code point of `-`; two of these at positions 2 and 3 mark a label extension.
const HYPHEN: u32 = '-' as u32;
/// Code point of `.`, inserted between labels when a fragment is reassembled.
const STOP: u32 = '.' as u32;
/// Code point U+FE0F, treated specially when building and filtering emoji sequences.
const FE0F: u32 = '\u{FE0F}' as u32;
/// Sentinel stored in `whole_map` instead of a real index into `wholes`.
const UNIQUE_PH: usize = usize::MAX;
15
16#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct Label {
18    pub input: Vec<u32>,
19    pub offset: usize,
20    pub error: Option<EnsError>,
21    pub tokens: Option<Vec<Vec<u32>>>,
22    pub output: Option<Vec<u32>>,
23    pub emoji: Option<bool>,
24    pub label_type: Option<String>,
25}
26
/// One element of a tokenized name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    Stop {
        cp: u32,
    },
    Disallowed {
        cp: u32,
    },
    Ignored {
        cp: u32,
    },
    Valid {
        cps: Vec<u32>,
    },
    Mapped {
        cp: u32,
        cps: Vec<u32>,
    },
    Emoji {
        input: Vec<u32>,
        cps: Vec<u32>,
        emoji: Vec<u32>,
    },
    Nfc {
        input: Vec<u32>,
        tokens0: Vec<Token>,
        cps: Vec<u32>,
        tokens: Vec<Token>,
    },
}

impl Token {
    /// Returns the lowercase tag naming this token's variant.
    pub fn token_type(&self) -> &'static str {
        match self {
            Token::Stop { .. } => "stop",
            Token::Disallowed { .. } => "disallowed",
            Token::Ignored { .. } => "ignored",
            Token::Valid { .. } => "valid",
            Token::Mapped { .. } => "mapped",
            Token::Emoji { .. } => "emoji",
            Token::Nfc { .. } => "nfc",
        }
    }

    /// Returns the `cps` field for the variants that have one, `None` otherwise.
    pub fn cps(&self) -> Option<&[u32]> {
        // Every variant is listed explicitly so that adding a new variant is a
        // compile error here instead of silently falling into `None`.
        match self {
            Token::Valid { cps }
            | Token::Mapped { cps, .. }
            | Token::Emoji { cps, .. }
            | Token::Nfc { cps, .. } => Some(cps),
            Token::Stop { .. } | Token::Disallowed { .. } | Token::Ignored { .. } => None,
        }
    }
}
81
/// Options accepted by `ens_tokenize_with_options`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TokenizeOptions {
    /// Forwarded unchanged to `tokenize` as its `nf` argument.
    pub nf: bool,
}

impl Default for TokenizeOptions {
    /// Default options have `nf` switched on.
    fn default() -> Self {
        TokenizeOptions { nf: true }
    }
}
92
93#[derive(Deserialize)]
94struct RawSpec {
95    emoji: Vec<Vec<u32>>,
96    ignored: Vec<u32>,
97    mapped: Vec<(u32, Vec<u32>)>,
98    fenced: Vec<(u32, String)>,
99    wholes: Vec<RawWhole>,
100    cm: Vec<u32>,
101    nsm: Vec<u32>,
102    nsm_max: usize,
103    escape: Vec<u32>,
104    groups: Vec<RawGroup>,
105    nfc_check: Vec<u32>,
106}
107
108#[derive(Deserialize)]
109struct RawWhole {
110    valid: Vec<u32>,
111    confused: Vec<u32>,
112}
113
114#[derive(Deserialize)]
115struct RawGroup {
116    name: String,
117    #[serde(default)]
118    restricted: bool,
119    primary: Vec<u32>,
120    secondary: Vec<u32>,
121    cm: Option<Vec<serde_json::Value>>,
122}
123
124struct Group {
125    name: String,
126    primary: IntSet<u32>,
127    secondary: IntSet<u32>,
128    check_nsm: bool,
129}
130
131impl Group {
132    fn has_cp(&self, cp: u32) -> bool {
133        self.primary.contains(&cp) || self.secondary.contains(&cp)
134    }
135}
136
137struct Whole {
138    complements: IntMap<u32, Vec<usize>>,
139}
140
141#[derive(Default)]
142struct EmojiNode {
143    children: IntMap<u32, usize>,
144    value: Option<Vec<u32>>,
145}
146
147#[derive(Default)]
148struct EmojiTrie {
149    nodes: Vec<EmojiNode>,
150}
151
152impl EmojiTrie {
153    fn new() -> Self {
154        Self {
155            nodes: vec![EmojiNode::default()],
156        }
157    }
158
159    fn child_or_insert(&mut self, node: usize, cp: u32) -> usize {
160        if let Some(&child) = self.nodes[node].children.get(&cp) {
161            return child;
162        }
163        let child = self.nodes.len();
164        self.nodes.push(EmojiNode::default());
165        self.nodes[node].children.insert(cp, child);
166        child
167    }
168}
169
170struct EnsData {
171    mapped: IntMap<u32, Vec<u32>>,
172    ignored: IntSet<u32>,
173    cm: IntSet<u32>,
174    nsm: IntSet<u32>,
175    nsm_check: IntSet<u32>,
176    nsm_max: usize,
177    escape: IntSet<u32>,
178    nfc_check: IntSet<u32>,
179    fenced: IntMap<u32, String>,
180    groups: Vec<Group>,
181    group_members: IntMap<u32, Vec<usize>>,
182    primary_group: IntMap<u32, usize>,
183    whole_map: IntMap<u32, usize>,
184    wholes: Vec<Whole>,
185    valid: IntSet<u32>,
186    emoji_list: Vec<Vec<u32>>,
187    emoji_root: EmojiTrie,
188}
189
190static ENS: LazyLock<EnsData> = LazyLock::new(|| {
191    let raw: RawSpec =
192        serde_json::from_str(include_str!("../data/spec.json")).expect("valid spec.json");
193    EnsData::from_raw(raw)
194});
195
196impl EnsData {
197    fn from_raw(raw: RawSpec) -> Self {
198        let groups: Vec<Group> = raw
199            .groups
200            .into_iter()
201            .map(|g| {
202                let name = if g.restricted {
203                    format!("Restricted[{}]", g.name)
204                } else {
205                    g.name
206                };
207                Group {
208                    name,
209                    primary: g.primary.into_iter().collect(),
210                    secondary: g.secondary.into_iter().collect(),
211                    check_nsm: g.cm.is_none(),
212                }
213            })
214            .collect();
215
216        let mut group_members: IntMap<u32, Vec<usize>> = IntMap::default();
217        let mut primary_group = IntMap::default();
218        for (i, group) in groups.iter().enumerate() {
219            for &cp in &group.primary {
220                primary_group.entry(cp).or_insert(i);
221                let members = group_members.entry(cp).or_default();
222                if !members.contains(&i) {
223                    members.push(i);
224                }
225            }
226            for &cp in &group.secondary {
227                let members = group_members.entry(cp).or_default();
228                if !members.contains(&i) {
229                    members.push(i);
230                }
231            }
232        }
233
234        let mut wholes = Vec::new();
235        let mut whole_map = IntMap::default();
236        for raw_whole in raw.wholes {
237            if raw_whole.confused.is_empty() {
238                continue;
239            }
240
241            let values: Vec<u32> = raw_whole
242                .valid
243                .iter()
244                .chain(raw_whole.confused.iter())
245                .copied()
246                .collect();
247            let complements = compute_whole_complements(&groups, &values);
248            let whole_index = wholes.len();
249            for cp in raw_whole.confused {
250                whole_map.insert(cp, whole_index);
251            }
252            wholes.push(Whole { complements });
253        }
254
255        let mut valid = IntSet::default();
256        let mut multi = IntSet::default();
257        for g in &groups {
258            for &cp in g.primary.iter().chain(g.secondary.iter()) {
259                if !valid.insert(cp) {
260                    multi.insert(cp);
261                }
262            }
263        }
264
265        for &cp in &valid {
266            if !whole_map.contains_key(&cp) && !multi.contains(&cp) {
267                whole_map.insert(cp, UNIQUE_PH);
268            }
269        }
270
271        let valid_vec: Vec<u32> = valid.iter().copied().collect();
272        for cp in nfd(&valid_vec) {
273            valid.insert(cp);
274        }
275        let nsm: IntSet<u32> = raw.nsm.into_iter().collect();
276        let nsm_check: IntSet<u32> = valid
277            .iter()
278            .copied()
279            .filter(|&cp| nfd(&[cp]).iter().any(|part| nsm.contains(part)))
280            .collect();
281
282        let mut emoji_list = raw.emoji;
283        emoji_list.sort_by(|a, b| compare_arrays(a, b).cmp(&0));
284        let mut emoji_root = EmojiTrie::new();
285        for cps in &emoji_list {
286            let mut prev = vec![0usize];
287            for &cp in cps {
288                let next: Vec<usize> = prev
289                    .iter()
290                    .map(|&node| emoji_root.child_or_insert(node, cp))
291                    .collect();
292                if cp == FE0F {
293                    prev.extend(next);
294                } else {
295                    prev = next;
296                }
297            }
298            for node in prev {
299                emoji_root.nodes[node].value = Some(cps.clone());
300            }
301        }
302
303        Self {
304            mapped: raw.mapped.into_iter().collect(),
305            ignored: raw.ignored.into_iter().collect(),
306            cm: raw.cm.into_iter().collect(),
307            nsm,
308            nsm_check,
309            nsm_max: raw.nsm_max,
310            escape: raw.escape.into_iter().collect(),
311            nfc_check: raw.nfc_check.into_iter().collect(),
312            fenced: raw.fenced.into_iter().collect(),
313            groups,
314            group_members,
315            primary_group,
316            whole_map,
317            wholes,
318            valid,
319            emoji_list,
320            emoji_root,
321        }
322    }
323}
324
/// Working record used by `compute_whole_complements`: a set of group indices
/// together with the code points that were assigned to it.
struct WholeRec {
    groups: Vec<usize>,
    values: Vec<u32>,
}
329
/// Appends `x` to `v` unless `v` already contains it.
fn push_unique(v: &mut Vec<usize>, x: usize) {
    let already_present = v.iter().any(|&existing| existing == x);
    if already_present {
        return;
    }
    v.push(x);
}
335
336fn compute_whole_complements(groups: &[Group], values: &[u32]) -> IntMap<u32, Vec<usize>> {
337    let mut recs: Vec<WholeRec> = Vec::new();
338    for &cp in values {
339        let gs: Vec<usize> = groups
340            .iter()
341            .enumerate()
342            .filter_map(|(i, g)| g.has_cp(cp).then_some(i))
343            .collect();
344        let rec_index = recs
345            .iter()
346            .position(|rec| gs.iter().any(|g| rec.groups.contains(g)));
347        let rec_index = match rec_index {
348            Some(i) => i,
349            None => {
350                recs.push(WholeRec {
351                    groups: Vec::new(),
352                    values: Vec::new(),
353                });
354                recs.len() - 1
355            }
356        };
357        recs[rec_index].values.push(cp);
358        for g in gs {
359            push_unique(&mut recs[rec_index].groups, g);
360        }
361    }
362
363    let mut union = Vec::new();
364    for rec in &recs {
365        for &g in &rec.groups {
366            push_unique(&mut union, g);
367        }
368    }
369
370    let mut complements = IntMap::default();
371    for rec in recs {
372        let complement: Vec<usize> = union
373            .iter()
374            .copied()
375            .filter(|g| !rec.groups.contains(g))
376            .collect();
377        for cp in rec.values {
378            complements.insert(cp, complement.clone());
379        }
380    }
381    complements
382}
383
/// A token produced by `tokens_from_str`: either an emoji sequence or a run of
/// text code points.
#[derive(Clone)]
struct NormToken {
    /// Code points of the token after normalization / emoji filtering.
    cps: Vec<u32>,
    /// `true` for emoji tokens, `false` for text runs.
    is_emoji: bool,
}
389
390pub fn is_combining_mark(cp: u32, only_nsm: bool) -> bool {
391    if only_nsm {
392        ENS.nsm.contains(&cp)
393    } else {
394        ENS.cm.contains(&cp)
395    }
396}
397
398pub fn should_escape(cp: u32) -> bool {
399    ENS.escape.contains(&cp)
400}
401
402pub fn ens_emoji() -> Vec<Vec<u32>> {
403    ENS.emoji_list.clone()
404}
405
406pub fn ens_normalize_fragment(frag: &str, decompose: bool) -> Result<String> {
407    let nf = if decompose {
408        NormalizeForm::Nfd
409    } else {
410        NormalizeForm::Nfc
411    };
412    let mut out = Vec::new();
413    for (i, label) in frag.split('.').enumerate() {
414        if i > 0 {
415            out.push(STOP);
416        }
417        let input = explode_cp(label);
418        let tokens = tokens_from_str(&input, nf, EmojiFilter::DropFe0f)?;
419        out.extend(tokens.into_iter().flat_map(|t| t.cps));
420    }
421    str_from_cps(&out)
422}
423
424pub fn ens_normalize(name: &str) -> Result<String> {
425    if let Some(result) = normalize_ascii(name) {
426        return result;
427    }
428    normalize_labels(name)
429}
430
431pub fn ens_beautify(name: &str) -> Result<String> {
432    let mut labels = split(name, NormalizeForm::Nfc, EmojiFilter::Preserve);
433    for label in &mut labels {
434        if label.error.is_some() {
435            break;
436        }
437        if label.label_type.as_deref() != Some("Greek")
438            && let Some(output) = &mut label.output
439        {
440            array_replace(output, 0x3BE, 0x39E);
441        }
442    }
443    flatten(labels)
444}
445
446pub fn ens_split(name: &str, preserve_emoji: bool) -> Vec<Label> {
447    split(
448        name,
449        NormalizeForm::Nfc,
450        if preserve_emoji {
451            EmojiFilter::Preserve
452        } else {
453            EmojiFilter::DropFe0f
454        },
455    )
456}
457
458fn split(name: &str, nf: NormalizeForm, ef: EmojiFilter) -> Vec<Label> {
459    if name.is_empty() {
460        return Vec::new();
461    }
462
463    let mut offset = 0usize;
464    name.split('.')
465        .map(|label| {
466            let input = explode_cp(label);
467            let mut info = Label {
468                input: input.clone(),
469                offset,
470                error: None,
471                tokens: None,
472                output: None,
473                emoji: None,
474                label_type: None,
475            };
476            offset += input.len() + 1;
477
478            if let Err(err) = process_label(&input, nf, ef, &mut info) {
479                info.error = Some(err);
480            }
481            info
482        })
483        .collect()
484}
485
486fn process_label(
487    input: &[u32],
488    nf: NormalizeForm,
489    ef: EmojiFilter,
490    info: &mut Label,
491) -> Result<()> {
492    let tokens = tokens_from_str(input, nf, ef)?;
493    info.tokens = Some(tokens.iter().map(|t| t.cps.clone()).collect());
494    if tokens.is_empty() {
495        return Err(EnsError::new("empty label"));
496    }
497
498    let output: Vec<u32> = tokens.iter().flat_map(|t| t.cps.iter().copied()).collect();
499    info.output = Some(output.clone());
500    check_leading_underscore(&output)?;
501    let emoji = tokens.len() > 1 || tokens[0].is_emoji;
502    info.emoji = Some(emoji);
503    let label_type = if !emoji && output.iter().all(|&cp| cp < 0x80) {
504        check_label_extension(&output)?;
505        "ASCII".to_string()
506    } else {
507        let chars_storage;
508        let chars: &[u32] = if emoji {
509            chars_storage = tokens
510                .iter()
511                .filter(|t| !t.is_emoji)
512                .flat_map(|t| t.cps.iter().copied())
513                .collect::<Vec<_>>();
514            &chars_storage
515        } else {
516            &output
517        };
518        if chars.is_empty() {
519            "Emoji".to_string()
520        } else {
521            if ENS.cm.contains(&output[0]) {
522                return Err(error_placement("leading combining mark"));
523            }
524            for i in 1..tokens.len() {
525                if !tokens[i].is_emoji && ENS.cm.contains(&tokens[i].cps[0]) {
526                    let prev = str_from_cps(&tokens[i - 1].cps)?;
527                    let mark = safe_str_from_cps(&[tokens[i].cps[0]], None);
528                    return Err(error_placement(&format!(
529                        "emoji + combining mark: \"{prev} + {mark}\""
530                    )));
531                }
532            }
533
534            check_fenced(&output)?;
535            let unique = unique_preserving_order(chars);
536            let group = determine_group(&unique)?;
537            check_group(group, chars)?;
538            check_whole(group, &unique)?;
539            ENS.groups[group].name.clone()
540        }
541    };
542
543    info.label_type = Some(label_type);
544    Ok(())
545}
546
547fn process_label_output(input: &[u32], nf: NormalizeForm, ef: EmojiFilter) -> Result<Vec<u32>> {
548    let tokens = tokens_from_str(input, nf, ef)?;
549    if tokens.is_empty() {
550        return Err(EnsError::new("empty label"));
551    }
552
553    let output: Vec<u32> = tokens.iter().flat_map(|t| t.cps.iter().copied()).collect();
554    check_leading_underscore(&output)?;
555    let emoji = tokens.len() > 1 || tokens[0].is_emoji;
556    if !emoji && output.iter().all(|&cp| cp < 0x80) {
557        check_label_extension(&output)?;
558    } else {
559        let chars_storage;
560        let chars: &[u32] = if emoji {
561            chars_storage = tokens
562                .iter()
563                .filter(|t| !t.is_emoji)
564                .flat_map(|t| t.cps.iter().copied())
565                .collect::<Vec<_>>();
566            &chars_storage
567        } else {
568            &output
569        };
570        if !chars.is_empty() {
571            if ENS.cm.contains(&output[0]) {
572                return Err(error_placement("leading combining mark"));
573            }
574            for i in 1..tokens.len() {
575                if !tokens[i].is_emoji && ENS.cm.contains(&tokens[i].cps[0]) {
576                    let prev = str_from_cps(&tokens[i - 1].cps)?;
577                    let mark = safe_str_from_cps(&[tokens[i].cps[0]], None);
578                    return Err(error_placement(&format!(
579                        "emoji + combining mark: \"{prev} + {mark}\""
580                    )));
581                }
582            }
583
584            check_fenced(&output)?;
585            let unique = unique_preserving_order(chars);
586            let group = determine_group(&unique)?;
587            check_group(group, chars)?;
588            check_whole(group, &unique)?;
589        }
590    }
591
592    Ok(output)
593}
594
595fn process_text_label_output(input: &[u32]) -> Option<Result<Vec<u32>>> {
596    let mut chars = Vec::with_capacity(input.len());
597    for &cp in input {
598        if ENS.emoji_root.nodes[0].children.contains_key(&cp) {
599            return None;
600        }
601        if ENS.valid.contains(&cp) {
602            chars.push(cp);
603        } else if let Some(cps) = ENS.mapped.get(&cp) {
604            chars.extend_from_slice(cps);
605        } else if !ENS.ignored.contains(&cp) {
606            return Some(Err(error_disallowed(cp)));
607        }
608    }
609
610    let output = NormalizeForm::Nfc.apply(&chars);
611    Some(validate_text_label_output(&output).map(|()| output))
612}
613
614fn validate_text_label_output(output: &[u32]) -> Result<()> {
615    if output.is_empty() {
616        return Err(EnsError::new("empty label"));
617    }
618    check_leading_underscore(output)?;
619    if output.iter().all(|&cp| cp < 0x80) {
620        check_label_extension(output)?;
621    } else {
622        if ENS.cm.contains(&output[0]) {
623            return Err(error_placement("leading combining mark"));
624        }
625        check_fenced(output)?;
626        let unique = unique_preserving_order(output);
627        let group = determine_group(&unique)?;
628        check_group(group, output)?;
629        check_whole(group, &unique)?;
630    }
631    Ok(())
632}
633
634fn normalize_labels(name: &str) -> Result<String> {
635    if name.is_empty() {
636        return Ok(String::new());
637    }
638
639    let labels: Vec<&str> = name.split('.').collect();
640    let multiple = labels.len() != 1;
641    let mut out = String::with_capacity(name.len());
642    for (i, label) in labels.iter().enumerate() {
643        if i > 0 {
644            out.push('.');
645        }
646        if let Some(label) = normalize_ascii_label(label) {
647            out.push_str(&label);
648            continue;
649        }
650        let input = explode_cp(label);
651        let result = process_text_label_output(&input).unwrap_or_else(|| {
652            process_label_output(&input, NormalizeForm::Nfc, EmojiFilter::DropFe0f)
653        });
654        match result {
655            Ok(output) => out.push_str(&str_from_cps(&output)?),
656            Err(error) if multiple => {
657                let safe = safe_str_from_cps(&input, Some(63));
658                return Err(EnsError::new(format!(
659                    "Invalid label {}: {}",
660                    bidi_qq(&safe),
661                    error.message()
662                )));
663            }
664            Err(error) => return Err(error),
665        }
666    }
667    Ok(out)
668}
669
670fn normalize_ascii(name: &str) -> Option<Result<String>> {
671    if name.is_empty() {
672        return Some(Ok(String::new()));
673    }
674    if !name.is_ascii() {
675        return None;
676    }
677
678    let mut start = 0;
679    let mut changed = false;
680    for (i, byte) in name.bytes().enumerate() {
681        if byte == b'.' {
682            if !valid_ascii_label(&name.as_bytes()[start..i]) {
683                return None;
684            }
685            start = i + 1;
686        } else if byte.is_ascii_uppercase() {
687            changed = true;
688        } else if !is_valid_ascii_byte(byte) {
689            return None;
690        }
691    }
692
693    if !valid_ascii_label(&name.as_bytes()[start..]) {
694        return None;
695    }
696
697    if changed {
698        let mut out = String::with_capacity(name.len());
699        for byte in name.bytes() {
700            if byte.is_ascii_uppercase() {
701                out.push(char::from(byte + 32));
702            } else {
703                out.push(char::from(byte));
704            }
705        }
706        Some(Ok(out))
707    } else {
708        Some(Ok(name.to_owned()))
709    }
710}
711
712fn normalize_ascii_label(label: &str) -> Option<Cow<'_, str>> {
713    if label.is_empty() || !label.is_ascii() {
714        return None;
715    }
716    let bytes = label.as_bytes();
717    if !valid_ascii_label(bytes) {
718        return None;
719    }
720
721    let mut changed = false;
722    for &byte in bytes {
723        if byte.is_ascii_uppercase() {
724            changed = true;
725        } else if !is_valid_ascii_byte(byte) {
726            return None;
727        }
728    }
729
730    if changed {
731        let mut out = String::with_capacity(label.len());
732        for byte in label.bytes() {
733            if byte.is_ascii_uppercase() {
734                out.push(char::from(byte + 32));
735            } else {
736                out.push(char::from(byte));
737            }
738        }
739        Some(Cow::Owned(out))
740    } else {
741        Some(Cow::Borrowed(label))
742    }
743}
744
/// Accepts ASCII lowercase letters, ASCII digits, `-`, `_` and `$`.
fn is_valid_ascii_byte(byte: u8) -> bool {
    byte.is_ascii_lowercase()
        || byte.is_ascii_digit()
        || byte == b'-'
        || byte == b'_'
        || byte == b'$'
}
748
/// Structural check for an ASCII label.
///
/// Rejects an empty label, a label with `--` at byte positions 2 and 3, and a
/// label containing an underscore anywhere other than in a leading run of
/// underscores. Individual bytes are not otherwise inspected.
fn valid_ascii_label(label: &[u8]) -> bool {
    if label.is_empty() {
        return false;
    }
    if matches!(label, [_, _, b'-', b'-', ..]) {
        return false;
    }
    // Everything before the last underscore must itself be an underscore;
    // with no underscore at all the prefix is empty and trivially passes.
    let last_underscore = label.iter().rposition(|&byte| byte == b'_').unwrap_or(0);
    label[..last_underscore].iter().all(|&byte| byte == b'_')
}
761
762fn unique_preserving_order(cps: &[u32]) -> Vec<u32> {
763    if cps.len() <= 64 {
764        let mut unique = Vec::new();
765        for &cp in cps {
766            if !unique.contains(&cp) {
767                unique.push(cp);
768            }
769        }
770        return unique;
771    }
772
773    let mut seen = IntSet::default();
774    let mut unique = Vec::new();
775    for &cp in cps {
776        if seen.insert(cp) {
777            unique.push(cp);
778        }
779    }
780    unique
781}
782
783fn check_label_extension(cps: &[u32]) -> Result<()> {
784    if cps.len() >= 4 && cps[2] == HYPHEN && cps[3] == HYPHEN {
785        let s = str_from_cps(&cps[..4])?;
786        Err(EnsError::new(format!("invalid label extension: \"{s}\"")))
787    } else {
788        Ok(())
789    }
790}
791
792fn check_leading_underscore(cps: &[u32]) -> Result<()> {
793    const UNDERSCORE: u32 = 0x5F;
794    if let Some(mut i) = cps.iter().rposition(|&cp| cp == UNDERSCORE) {
795        while i > 0 {
796            i -= 1;
797            if cps[i] != UNDERSCORE {
798                return Err(EnsError::new("underscore allowed only at start"));
799            }
800        }
801    }
802    Ok(())
803}
804
805fn check_fenced(cps: &[u32]) -> Result<()> {
806    if cps.is_empty() {
807        return Ok(());
808    }
809    let mut prev = ENS.fenced.get(&cps[0]);
810    if let Some(prev) = prev {
811        return Err(error_placement(&format!("leading {prev}")));
812    }
813
814    let mut last = usize::MAX;
815    for (i, &cp) in cps.iter().enumerate().skip(1) {
816        if let Some(matched) = ENS.fenced.get(&cp) {
817            if last == i {
818                return Err(error_placement(&format!("{} + {matched}", prev.unwrap())));
819            }
820            last = i + 1;
821            prev = Some(matched);
822        }
823    }
824    if last == cps.len()
825        && let Some(prev) = prev
826    {
827        return Err(error_placement(&format!("trailing {prev}")));
828    }
829    Ok(())
830}
831
832fn determine_group(unique: &[u32]) -> Result<usize> {
833    let mut groups: Option<Vec<usize>> = None;
834    for &cp in unique {
835        let Some(cp_groups) = ENS.group_members.get(&cp) else {
836            return Err(error_disallowed(cp));
837        };
838        let gs: Vec<usize> = if let Some(groups) = groups.take() {
839            let first = groups[0];
840            let filtered: Vec<usize> = groups
841                .into_iter()
842                .filter(|i| cp_groups.contains(i))
843                .collect();
844            if filtered.is_empty() {
845                return Err(error_group_member(first, cp));
846            }
847            filtered
848        } else {
849            cp_groups.clone()
850        };
851        if gs.len() == 1 {
852            return Ok(gs[0]);
853        }
854        groups = Some(gs);
855    }
856    Ok(groups.expect("unique has at least one code point")[0])
857}
858
859fn check_group(group: usize, cps: &[u32]) -> Result<()> {
860    let g = &ENS.groups[group];
861    for &cp in cps {
862        if !g.has_cp(cp) {
863            return Err(error_group_member(group, cp));
864        }
865    }
866
867    if g.check_nsm && cps.iter().any(|cp| ENS.nsm_check.contains(cp)) {
868        let decomposed = nfd(cps);
869        let mut i = 1usize;
870        while i < decomposed.len() {
871            if ENS.nsm.contains(&decomposed[i]) {
872                let mut j = i + 1;
873                while j < decomposed.len() && ENS.nsm.contains(&decomposed[j]) {
874                    for k in i..j {
875                        if decomposed[k] == decomposed[j] {
876                            return Err(EnsError::new(format!(
877                                "duplicate non-spacing marks: {}",
878                                quoted_cp(decomposed[j])
879                            )));
880                        }
881                    }
882                    j += 1;
883                }
884                if j - i > ENS.nsm_max {
885                    let s = safe_str_from_cps(&decomposed[i - 1..j], None);
886                    return Err(EnsError::new(format!(
887                        "excessive non-spacing marks: {} ({}/{})",
888                        bidi_qq(&s),
889                        j - i,
890                        ENS.nsm_max
891                    )));
892                }
893                i = j;
894            } else {
895                i += 1;
896            }
897        }
898    }
899
900    Ok(())
901}
902
903fn check_whole(group: usize, unique: &[u32]) -> Result<()> {
904    let mut maker: Option<Vec<usize>> = None;
905    let mut shared = Vec::new();
906    for &cp in unique {
907        match ENS.whole_map.get(&cp).copied() {
908            Some(UNIQUE_PH) => return Ok(()),
909            Some(whole_index) => {
910                let set = ENS.wholes[whole_index]
911                    .complements
912                    .get(&cp)
913                    .cloned()
914                    .unwrap_or_default();
915                maker = Some(match maker {
916                    Some(prev) => prev.into_iter().filter(|g| set.contains(g)).collect(),
917                    None => set,
918                });
919                if maker.as_ref().is_some_and(|m| m.is_empty()) {
920                    return Ok(());
921                }
922            }
923            None => shared.push(cp),
924        }
925    }
926
927    if let Some(maker) = maker {
928        for other in maker {
929            if shared.iter().all(|&cp| ENS.groups[other].has_cp(cp)) {
930                return Err(EnsError::new(format!(
931                    "whole-script confusable: {}/{}",
932                    ENS.groups[group].name, ENS.groups[other].name
933                )));
934            }
935        }
936    }
937    Ok(())
938}
939
940fn flatten(labels: Vec<Label>) -> Result<String> {
941    let multiple = labels.len() != 1;
942    let mut out = Vec::new();
943    for label in labels {
944        if let Some(error) = label.error {
945            if multiple {
946                let safe = safe_str_from_cps(&label.input, Some(63));
947                return Err(EnsError::new(format!(
948                    "Invalid label {}: {}",
949                    bidi_qq(&safe),
950                    error.message()
951                )));
952            }
953            return Err(error);
954        }
955        out.push(str_from_cps(label.output.as_deref().unwrap_or_default())?);
956    }
957    Ok(out.join("."))
958}
959
960fn quoted_cp(cp: u32) -> String {
961    let prefix = if should_escape(cp) {
962        String::new()
963    } else {
964        format!("{} ", bidi_qq(&safe_str_from_cps(&[cp], None)))
965    };
966    format!("{prefix}{}", quote_cp(cp))
967}
968
969fn error_disallowed(cp: u32) -> EnsError {
970    EnsError::new(format!("disallowed character: {}", quoted_cp(cp)))
971}
972
973fn error_group_member(group: usize, cp: u32) -> EnsError {
974    let mut quoted = quoted_cp(cp);
975    if let Some(&gg) = ENS.primary_group.get(&cp) {
976        let gg = &ENS.groups[gg];
977        quoted = format!("{} {quoted}", gg.name);
978    }
979    EnsError::new(format!(
980        "illegal mixture: {} + {quoted}",
981        ENS.groups[group].name
982    ))
983}
984
985fn error_placement(where_: &str) -> EnsError {
986    EnsError::new(format!("illegal placement: {where_}"))
987}
988
989#[derive(Debug, Clone, Copy)]
990enum NormalizeForm {
991    Nfc,
992    Nfd,
993}
994
995impl NormalizeForm {
996    fn apply(self, cps: &[u32]) -> Vec<u32> {
997        match self {
998            Self::Nfc if !requires_check(cps) => cps.to_vec(),
999            Self::Nfc => nfc(cps),
1000            Self::Nfd => nfd(cps),
1001        }
1002    }
1003}
1004
/// How `filter_emoji` treats a matched emoji sequence.
#[derive(Clone, Copy, Debug)]
enum EmojiFilter {
    /// Keep the sequence unchanged.
    Preserve,
    /// Remove every FE0F code point from the sequence.
    DropFe0f,
}
1010
1011fn filter_emoji(cps: &[u32], filter: EmojiFilter) -> Vec<u32> {
1012    match filter {
1013        EmojiFilter::Preserve => cps.to_vec(),
1014        EmojiFilter::DropFe0f => cps.iter().copied().filter(|&cp| cp != FE0F).collect(),
1015    }
1016}
1017
1018fn tokens_from_str(input: &[u32], nf: NormalizeForm, ef: EmojiFilter) -> Result<Vec<NormToken>> {
1019    let mut ret = Vec::new();
1020    let mut chars = Vec::new();
1021    let mut input = input.to_vec();
1022    input.reverse();
1023
1024    while !input.is_empty() {
1025        if let Some(emoji) = consume_emoji_reversed(&mut input, None) {
1026            if !chars.is_empty() {
1027                ret.push(NormToken {
1028                    cps: nf.apply(&chars),
1029                    is_emoji: false,
1030                });
1031                chars.clear();
1032            }
1033            ret.push(NormToken {
1034                cps: filter_emoji(&emoji, ef),
1035                is_emoji: true,
1036            });
1037        } else {
1038            let cp = input.pop().expect("input is not empty");
1039            if ENS.valid.contains(&cp) {
1040                chars.push(cp);
1041            } else if let Some(cps) = ENS.mapped.get(&cp) {
1042                chars.extend_from_slice(cps);
1043            } else if !ENS.ignored.contains(&cp) {
1044                return Err(error_disallowed(cp));
1045            }
1046        }
1047    }
1048
1049    if !chars.is_empty() {
1050        ret.push(NormToken {
1051            cps: nf.apply(&chars),
1052            is_emoji: false,
1053        });
1054    }
1055
1056    Ok(ret)
1057}
1058
1059fn consume_emoji_reversed(input: &mut Vec<u32>, eaten: Option<&mut Vec<u32>>) -> Option<Vec<u32>> {
1060    let mut eaten = eaten;
1061    let mut node = 0usize;
1062    let mut emoji = None;
1063    let mut pos = input.len();
1064    while pos > 0 {
1065        pos -= 1;
1066        let cp = input[pos];
1067        let Some(&child) = ENS.emoji_root.nodes[node].children.get(&cp) else {
1068            break;
1069        };
1070        node = child;
1071        if let Some(value) = ENS.emoji_root.nodes[node].value.clone() {
1072            if let Some(eaten) = eaten.as_deref_mut() {
1073                eaten.extend(input[pos..].iter().rev().copied());
1074            }
1075            input.truncate(pos);
1076            emoji = Some(value);
1077        }
1078    }
1079    emoji
1080}
1081
1082pub fn ens_tokenize(name: &str) -> Vec<Token> {
1083    ens_tokenize_with_options(name, TokenizeOptions::default())
1084}
1085
1086pub fn ens_tokenize_with_options(name: &str, options: TokenizeOptions) -> Vec<Token> {
1087    tokenize(name, options.nf)
1088}
1089
1090fn tokenize(name: &str, nf: bool) -> Vec<Token> {
1091    let mut input = explode_cp(name);
1092    input.reverse();
1093    let mut eaten = Vec::new();
1094    let mut tokens = Vec::new();
1095
1096    while !input.is_empty() {
1097        if let Some(emoji) = consume_emoji_reversed(&mut input, Some(&mut eaten)) {
1098            tokens.push(Token::Emoji {
1099                input: std::mem::take(&mut eaten),
1100                cps: filter_emoji(&emoji, EmojiFilter::DropFe0f),
1101                emoji,
1102            });
1103        } else {
1104            let cp = input.pop().expect("input is not empty");
1105            if cp == STOP {
1106                tokens.push(Token::Stop { cp });
1107            } else if ENS.valid.contains(&cp) {
1108                tokens.push(Token::Valid { cps: vec![cp] });
1109            } else if ENS.ignored.contains(&cp) {
1110                tokens.push(Token::Ignored { cp });
1111            } else if let Some(cps) = ENS.mapped.get(&cp) {
1112                tokens.push(Token::Mapped {
1113                    cp,
1114                    cps: cps.clone(),
1115                });
1116            } else {
1117                tokens.push(Token::Disallowed { cp });
1118            }
1119        }
1120    }
1121
1122    if nf {
1123        apply_token_nfc(&mut tokens);
1124    }
1125
1126    collapse_valid_tokens(tokens)
1127}
1128
1129fn is_valid_or_mapped(token: &Token) -> bool {
1130    matches!(token, Token::Valid { .. } | Token::Mapped { .. })
1131}
1132
1133fn valid_or_mapped_cps(token: &Token) -> Option<&[u32]> {
1134    match token {
1135        Token::Valid { cps } | Token::Mapped { cps, .. } => Some(cps),
1136        _ => None,
1137    }
1138}
1139
1140fn requires_check(cps: &[u32]) -> bool {
1141    cps.iter().any(|cp| ENS.nfc_check.contains(cp))
1142}
1143
1144fn apply_token_nfc(tokens: &mut Vec<Token>) {
1145    let mut i = 0usize;
1146    let mut start: Option<usize> = None;
1147    while i < tokens.len() {
1148        if is_valid_or_mapped(&tokens[i]) {
1149            let cps = valid_or_mapped_cps(&tokens[i]).unwrap();
1150            if requires_check(cps) {
1151                let mut end = i + 1;
1152                let mut pos = end;
1153                while pos < tokens.len() {
1154                    if let Some(cps) = valid_or_mapped_cps(&tokens[pos]) {
1155                        if !requires_check(cps) {
1156                            break;
1157                        }
1158                        end = pos + 1;
1159                    } else if !matches!(tokens[pos], Token::Ignored { .. }) {
1160                        break;
1161                    }
1162                    pos += 1;
1163                }
1164                let start_i = start.unwrap_or(i);
1165                let slice = tokens[start_i..end].to_vec();
1166                let cps0: Vec<u32> = slice
1167                    .iter()
1168                    .filter_map(valid_or_mapped_cps)
1169                    .flat_map(|cps| cps.iter().copied())
1170                    .collect();
1171                let cps = nfc(&cps0);
1172                if compare_arrays(&cps, &cps0) != 0 {
1173                    let text = str_from_cps(&cps).unwrap_or_default();
1174                    let replacement = Token::Nfc {
1175                        input: cps0,
1176                        tokens0: collapse_valid_tokens(slice),
1177                        cps,
1178                        tokens: tokenize(&text, false),
1179                    };
1180                    tokens.splice(start_i..end, [replacement]);
1181                    i = start_i;
1182                } else {
1183                    i = end.saturating_sub(1);
1184                }
1185                start = None;
1186            } else {
1187                start = Some(i);
1188            }
1189        } else if !matches!(tokens[i], Token::Ignored { .. }) {
1190            start = None;
1191        }
1192        i += 1;
1193    }
1194}
1195
1196fn collapse_valid_tokens(tokens: Vec<Token>) -> Vec<Token> {
1197    let mut out = Vec::new();
1198    let mut i = 0usize;
1199    while i < tokens.len() {
1200        if let Token::Valid { .. } = &tokens[i] {
1201            let mut cps = Vec::new();
1202            while i < tokens.len() {
1203                if let Token::Valid { cps: next } = &tokens[i] {
1204                    cps.extend_from_slice(next);
1205                    i += 1;
1206                } else {
1207                    break;
1208                }
1209            }
1210            out.push(Token::Valid { cps });
1211        } else {
1212            out.push(tokens[i].clone());
1213            i += 1;
1214        }
1215    }
1216    out
1217}