1use crate::intmap::{IntMap, IntSet};
2use crate::nf::{nfc, nfd};
3use crate::utils::{
4 EnsError, Result, array_replace, bidi_qq, compare_arrays, explode_cp, quote_cp,
5 safe_str_from_cps, str_from_cps,
6};
7use serde::Deserialize;
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Code point of `-`; two of these at indices 2 and 3 mark a rejected label extension.
const HYPHEN: u32 = '-' as u32;
/// Code point of `.`, the separator between labels.
const STOP: u32 = '.' as u32;
/// Code point U+FE0F, which may be dropped from emoji sequences (see `EmojiFilter::DropFe0f`).
const FE0F: u32 = '\u{FE0F}' as u32;
/// Sentinel stored in `EnsData::whole_map` in place of a real index into `EnsData::wholes`; it
/// marks a code point that belongs to exactly one group and to no confusable set.
const UNIQUE_PH: usize = usize::MAX;
15
/// Result of processing one label of a name.
///
/// `input` and `offset` are always set. The other fields are filled in as processing advances,
/// so a label that fails keeps whatever had been computed before the error occurred.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Label {
    /// Code points of the label as they appeared in the name.
    pub input: Vec<u32>,
    /// Position of the label's first code point within the name, counted in code points; every
    /// preceding label contributes its length plus one for the separator.
    pub offset: usize,
    /// Reason the label was rejected, or `None` if processing succeeded.
    pub error: Option<EnsError>,
    /// Code points of each token (text run or emoji) in order; `None` if tokenizing failed.
    pub tokens: Option<Vec<Vec<u32>>>,
    /// Concatenation of all token code points; `None` if the label failed before this was built.
    pub output: Option<Vec<u32>>,
    /// `true` when the label has more than one token or its only token is an emoji.
    pub emoji: Option<bool>,
    /// `"ASCII"`, `"Emoji"`, or the name of the group the label was resolved to.
    pub label_type: Option<String>,
}
26
/// One element of the token stream produced by `ens_tokenize`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// The label separator.
    Stop {
        cp: u32,
    },
    /// A code point that is not valid, ignored, or mapped.
    Disallowed {
        cp: u32,
    },
    /// A code point from the ignored set.
    Ignored {
        cp: u32,
    },
    /// A run of valid code points.
    Valid {
        cps: Vec<u32>,
    },
    /// A code point `cp` together with the code points `cps` it is mapped to.
    Mapped {
        cp: u32,
        cps: Vec<u32>,
    },
    /// An emoji sequence: `input` is what was consumed from the name, `emoji` is the matched
    /// sequence from the emoji table, and `cps` is that sequence with U+FE0F removed.
    Emoji {
        input: Vec<u32>,
        cps: Vec<u32>,
        emoji: Vec<u32>,
    },
    /// A span whose code points changed under NFC: `tokens0` are the tokens that were replaced,
    /// `input` their valid/mapped code points, `cps` the NFC result, and `tokens` the
    /// tokenization of `cps`.
    Nfc {
        input: Vec<u32>,
        tokens0: Vec<Token>,
        cps: Vec<u32>,
        tokens: Vec<Token>,
    },
}

impl Token {
    /// Returns the lowercase name of the variant.
    pub fn token_type(&self) -> &'static str {
        match self {
            Self::Stop { .. } => "stop",
            Self::Disallowed { .. } => "disallowed",
            Self::Ignored { .. } => "ignored",
            Self::Valid { .. } => "valid",
            Self::Mapped { .. } => "mapped",
            Self::Emoji { .. } => "emoji",
            Self::Nfc { .. } => "nfc",
        }
    }

    /// Returns the `cps` field of the variants that have one, and `None` for the rest.
    pub fn cps(&self) -> Option<&[u32]> {
        match self {
            Self::Valid { cps } => Some(cps.as_slice()),
            Self::Mapped { cps, .. } | Self::Emoji { cps, .. } | Self::Nfc { cps, .. } => {
                Some(cps.as_slice())
            }
            Self::Stop { .. } | Self::Disallowed { .. } | Self::Ignored { .. } => None,
        }
    }
}
81
/// Options accepted by `ens_tokenize_with_options`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenizeOptions {
    /// Whether the NFC pass runs over the token stream after tokenizing.
    pub nf: bool,
}

impl Default for TokenizeOptions {
    /// The default enables the NFC pass.
    fn default() -> Self {
        TokenizeOptions { nf: true }
    }
}
92
/// Deserialized form of the embedded `spec.json`, consumed by `EnsData::from_raw`.
#[derive(Deserialize)]
struct RawSpec {
    /// Emoji sequences; sorted and loaded into the emoji trie.
    emoji: Vec<Vec<u32>>,
    /// Code points that are dropped from text.
    ignored: Vec<u32>,
    /// Pairs of a code point and the code points that replace it.
    mapped: Vec<(u32, Vec<u32>)>,
    /// Pairs of a code point and the name used for it in placement errors.
    fenced: Vec<(u32, String)>,
    /// Confusable sets; entries with an empty `confused` list are skipped.
    wholes: Vec<RawWhole>,
    /// Combining marks.
    cm: Vec<u32>,
    /// Non-spacing marks.
    nsm: Vec<u32>,
    /// Longest run of consecutive non-spacing marks that `check_group` accepts.
    nsm_max: usize,
    /// Code points for which `should_escape` returns `true`.
    escape: Vec<u32>,
    /// Script groups, in the order their indices are assigned.
    groups: Vec<RawGroup>,
    /// Code points whose presence makes text go through NFC instead of being copied as is.
    nfc_check: Vec<u32>,
}
107
/// One confusable set from the spec.
#[derive(Deserialize)]
struct RawWhole {
    /// Members that take part in the complement computation but are not registered in
    /// `EnsData::whole_map`.
    valid: Vec<u32>,
    /// Members that are registered in `EnsData::whole_map`; an empty list disables the entry.
    confused: Vec<u32>,
}
113
/// One script group from the spec.
#[derive(Deserialize)]
struct RawGroup {
    /// Group name; wrapped as `Restricted[name]` when `restricted` is set.
    name: String,
    /// Defaults to `false` when the key is missing from the JSON.
    #[serde(default)]
    restricted: bool,
    /// Primary code points of the group.
    primary: Vec<u32>,
    /// Secondary code points of the group.
    secondary: Vec<u32>,
    /// Only the presence of this key is inspected: when it is absent, the group gets the
    /// non-spacing-mark checks in `check_group`.
    cm: Option<Vec<serde_json::Value>>,
}
123
124struct Group {
125 name: String,
126 primary: IntSet<u32>,
127 secondary: IntSet<u32>,
128 check_nsm: bool,
129}
130
131impl Group {
132 fn has_cp(&self, cp: u32) -> bool {
133 self.primary.contains(&cp) || self.secondary.contains(&cp)
134 }
135}
136
/// Runtime form of one confusable set.
struct Whole {
    /// For each member code point, the group indices produced by `compute_whole_complements`.
    complements: IntMap<u32, Vec<usize>>,
}
140
/// One node of the emoji trie.
#[derive(Default)]
struct EmojiNode {
    /// Index into `EmojiTrie::nodes` of the child reached by each code point.
    children: IntMap<u32, usize>,
    /// The emoji sequence that ends at this node, if any.
    value: Option<Vec<u32>>,
}
146
147#[derive(Default)]
148struct EmojiTrie {
149 nodes: Vec<EmojiNode>,
150}
151
152impl EmojiTrie {
153 fn new() -> Self {
154 Self {
155 nodes: vec![EmojiNode::default()],
156 }
157 }
158
159 fn child_or_insert(&mut self, node: usize, cp: u32) -> usize {
160 if let Some(&child) = self.nodes[node].children.get(&cp) {
161 return child;
162 }
163 let child = self.nodes.len();
164 self.nodes.push(EmojiNode::default());
165 self.nodes[node].children.insert(cp, child);
166 child
167 }
168}
169
/// Lookup tables derived from the spec; built once by `EnsData::from_raw`.
struct EnsData {
    /// Replacement code points for each mapped code point.
    mapped: IntMap<u32, Vec<u32>>,
    /// Code points that are dropped from text.
    ignored: IntSet<u32>,
    /// Combining marks.
    cm: IntSet<u32>,
    /// Non-spacing marks.
    nsm: IntSet<u32>,
    /// Valid code points whose NFD form contains at least one non-spacing mark.
    nsm_check: IntSet<u32>,
    /// Longest accepted run of consecutive non-spacing marks.
    nsm_max: usize,
    /// Code points for which `should_escape` returns `true`.
    escape: IntSet<u32>,
    /// Code points whose presence makes text go through NFC.
    nfc_check: IntSet<u32>,
    /// Name of each fenced code point, used in placement errors.
    fenced: IntMap<u32, String>,
    /// Script groups; other tables refer to them by index.
    groups: Vec<Group>,
    /// For each code point, the indices of the groups that contain it, in group order.
    group_members: IntMap<u32, Vec<usize>>,
    /// For each code point, the index of the first group that lists it as primary.
    primary_group: IntMap<u32, usize>,
    /// For each code point, either an index into `wholes` or `UNIQUE_PH`.
    whole_map: IntMap<u32, usize>,
    /// Confusable sets.
    wholes: Vec<Whole>,
    /// Every primary or secondary code point of any group, plus the NFD parts of those.
    valid: IntSet<u32>,
    /// Emoji sequences, sorted with `compare_arrays`.
    emoji_list: Vec<Vec<u32>>,
    /// Trie over `emoji_list` used to match emoji in input.
    emoji_root: EmojiTrie,
}
189
/// Shared tables, parsed from the embedded `spec.json` on first access.
///
/// # Panics
///
/// The first access panics if the embedded JSON does not deserialize into `RawSpec`.
static ENS: LazyLock<EnsData> = LazyLock::new(|| {
    let raw: RawSpec =
        serde_json::from_str(include_str!("../data/spec.json")).expect("valid spec.json");
    EnsData::from_raw(raw)
});
195
196impl EnsData {
197 fn from_raw(raw: RawSpec) -> Self {
198 let groups: Vec<Group> = raw
199 .groups
200 .into_iter()
201 .map(|g| {
202 let name = if g.restricted {
203 format!("Restricted[{}]", g.name)
204 } else {
205 g.name
206 };
207 Group {
208 name,
209 primary: g.primary.into_iter().collect(),
210 secondary: g.secondary.into_iter().collect(),
211 check_nsm: g.cm.is_none(),
212 }
213 })
214 .collect();
215
216 let mut group_members: IntMap<u32, Vec<usize>> = IntMap::default();
217 let mut primary_group = IntMap::default();
218 for (i, group) in groups.iter().enumerate() {
219 for &cp in &group.primary {
220 primary_group.entry(cp).or_insert(i);
221 let members = group_members.entry(cp).or_default();
222 if !members.contains(&i) {
223 members.push(i);
224 }
225 }
226 for &cp in &group.secondary {
227 let members = group_members.entry(cp).or_default();
228 if !members.contains(&i) {
229 members.push(i);
230 }
231 }
232 }
233
234 let mut wholes = Vec::new();
235 let mut whole_map = IntMap::default();
236 for raw_whole in raw.wholes {
237 if raw_whole.confused.is_empty() {
238 continue;
239 }
240
241 let values: Vec<u32> = raw_whole
242 .valid
243 .iter()
244 .chain(raw_whole.confused.iter())
245 .copied()
246 .collect();
247 let complements = compute_whole_complements(&groups, &values);
248 let whole_index = wholes.len();
249 for cp in raw_whole.confused {
250 whole_map.insert(cp, whole_index);
251 }
252 wholes.push(Whole { complements });
253 }
254
255 let mut valid = IntSet::default();
256 let mut multi = IntSet::default();
257 for g in &groups {
258 for &cp in g.primary.iter().chain(g.secondary.iter()) {
259 if !valid.insert(cp) {
260 multi.insert(cp);
261 }
262 }
263 }
264
265 for &cp in &valid {
266 if !whole_map.contains_key(&cp) && !multi.contains(&cp) {
267 whole_map.insert(cp, UNIQUE_PH);
268 }
269 }
270
271 let valid_vec: Vec<u32> = valid.iter().copied().collect();
272 for cp in nfd(&valid_vec) {
273 valid.insert(cp);
274 }
275 let nsm: IntSet<u32> = raw.nsm.into_iter().collect();
276 let nsm_check: IntSet<u32> = valid
277 .iter()
278 .copied()
279 .filter(|&cp| nfd(&[cp]).iter().any(|part| nsm.contains(part)))
280 .collect();
281
282 let mut emoji_list = raw.emoji;
283 emoji_list.sort_by(|a, b| compare_arrays(a, b).cmp(&0));
284 let mut emoji_root = EmojiTrie::new();
285 for cps in &emoji_list {
286 let mut prev = vec![0usize];
287 for &cp in cps {
288 let next: Vec<usize> = prev
289 .iter()
290 .map(|&node| emoji_root.child_or_insert(node, cp))
291 .collect();
292 if cp == FE0F {
293 prev.extend(next);
294 } else {
295 prev = next;
296 }
297 }
298 for node in prev {
299 emoji_root.nodes[node].value = Some(cps.clone());
300 }
301 }
302
303 Self {
304 mapped: raw.mapped.into_iter().collect(),
305 ignored: raw.ignored.into_iter().collect(),
306 cm: raw.cm.into_iter().collect(),
307 nsm,
308 nsm_check,
309 nsm_max: raw.nsm_max,
310 escape: raw.escape.into_iter().collect(),
311 nfc_check: raw.nfc_check.into_iter().collect(),
312 fenced: raw.fenced.into_iter().collect(),
313 groups,
314 group_members,
315 primary_group,
316 whole_map,
317 wholes,
318 valid,
319 emoji_list,
320 emoji_root,
321 }
322 }
323}
324
/// Working record of `compute_whole_complements`: a set of groups and the code points that
/// were assigned to it.
struct WholeRec {
    /// Indices of the groups collected for this record, without duplicates.
    groups: Vec<usize>,
    /// Code points assigned to this record, in processing order.
    values: Vec<u32>,
}
329
/// Appends `x` to `v` unless `v` already contains it.
fn push_unique(v: &mut Vec<usize>, x: usize) {
    let already_present = v.iter().any(|&existing| existing == x);
    if !already_present {
        v.push(x);
    }
}
335
336fn compute_whole_complements(groups: &[Group], values: &[u32]) -> IntMap<u32, Vec<usize>> {
337 let mut recs: Vec<WholeRec> = Vec::new();
338 for &cp in values {
339 let gs: Vec<usize> = groups
340 .iter()
341 .enumerate()
342 .filter_map(|(i, g)| g.has_cp(cp).then_some(i))
343 .collect();
344 let rec_index = recs
345 .iter()
346 .position(|rec| gs.iter().any(|g| rec.groups.contains(g)));
347 let rec_index = match rec_index {
348 Some(i) => i,
349 None => {
350 recs.push(WholeRec {
351 groups: Vec::new(),
352 values: Vec::new(),
353 });
354 recs.len() - 1
355 }
356 };
357 recs[rec_index].values.push(cp);
358 for g in gs {
359 push_unique(&mut recs[rec_index].groups, g);
360 }
361 }
362
363 let mut union = Vec::new();
364 for rec in &recs {
365 for &g in &rec.groups {
366 push_unique(&mut union, g);
367 }
368 }
369
370 let mut complements = IntMap::default();
371 for rec in recs {
372 let complement: Vec<usize> = union
373 .iter()
374 .copied()
375 .filter(|g| !rec.groups.contains(g))
376 .collect();
377 for cp in rec.values {
378 complements.insert(cp, complement.clone());
379 }
380 }
381 complements
382}
383
/// A token produced by `tokens_from_str`: either a normalized run of text or one emoji.
#[derive(Clone)]
struct NormToken {
    /// Output code points of the token.
    cps: Vec<u32>,
    /// `true` for an emoji token, `false` for a text run.
    is_emoji: bool,
}
389
390pub fn is_combining_mark(cp: u32, only_nsm: bool) -> bool {
391 if only_nsm {
392 ENS.nsm.contains(&cp)
393 } else {
394 ENS.cm.contains(&cp)
395 }
396}
397
398pub fn should_escape(cp: u32) -> bool {
399 ENS.escape.contains(&cp)
400}
401
402pub fn ens_emoji() -> Vec<Vec<u32>> {
403 ENS.emoji_list.clone()
404}
405
406pub fn ens_normalize_fragment(frag: &str, decompose: bool) -> Result<String> {
407 let nf = if decompose {
408 NormalizeForm::Nfd
409 } else {
410 NormalizeForm::Nfc
411 };
412 let mut out = Vec::new();
413 for (i, label) in frag.split('.').enumerate() {
414 if i > 0 {
415 out.push(STOP);
416 }
417 let input = explode_cp(label);
418 let tokens = tokens_from_str(&input, nf, EmojiFilter::DropFe0f)?;
419 out.extend(tokens.into_iter().flat_map(|t| t.cps));
420 }
421 str_from_cps(&out)
422}
423
424pub fn ens_normalize(name: &str) -> Result<String> {
425 if let Some(result) = normalize_ascii(name) {
426 return result;
427 }
428 normalize_labels(name)
429}
430
431pub fn ens_beautify(name: &str) -> Result<String> {
432 let mut labels = split(name, NormalizeForm::Nfc, EmojiFilter::Preserve);
433 for label in &mut labels {
434 if label.error.is_some() {
435 break;
436 }
437 if label.label_type.as_deref() != Some("Greek")
438 && let Some(output) = &mut label.output
439 {
440 array_replace(output, 0x3BE, 0x39E);
441 }
442 }
443 flatten(labels)
444}
445
446pub fn ens_split(name: &str, preserve_emoji: bool) -> Vec<Label> {
447 split(
448 name,
449 NormalizeForm::Nfc,
450 if preserve_emoji {
451 EmojiFilter::Preserve
452 } else {
453 EmojiFilter::DropFe0f
454 },
455 )
456}
457
458fn split(name: &str, nf: NormalizeForm, ef: EmojiFilter) -> Vec<Label> {
459 if name.is_empty() {
460 return Vec::new();
461 }
462
463 let mut offset = 0usize;
464 name.split('.')
465 .map(|label| {
466 let input = explode_cp(label);
467 let mut info = Label {
468 input: input.clone(),
469 offset,
470 error: None,
471 tokens: None,
472 output: None,
473 emoji: None,
474 label_type: None,
475 };
476 offset += input.len() + 1;
477
478 if let Err(err) = process_label(&input, nf, ef, &mut info) {
479 info.error = Some(err);
480 }
481 info
482 })
483 .collect()
484}
485
486fn process_label(
487 input: &[u32],
488 nf: NormalizeForm,
489 ef: EmojiFilter,
490 info: &mut Label,
491) -> Result<()> {
492 let tokens = tokens_from_str(input, nf, ef)?;
493 info.tokens = Some(tokens.iter().map(|t| t.cps.clone()).collect());
494 if tokens.is_empty() {
495 return Err(EnsError::new("empty label"));
496 }
497
498 let output: Vec<u32> = tokens.iter().flat_map(|t| t.cps.iter().copied()).collect();
499 info.output = Some(output.clone());
500 check_leading_underscore(&output)?;
501 let emoji = tokens.len() > 1 || tokens[0].is_emoji;
502 info.emoji = Some(emoji);
503 let label_type = if !emoji && output.iter().all(|&cp| cp < 0x80) {
504 check_label_extension(&output)?;
505 "ASCII".to_string()
506 } else {
507 let chars_storage;
508 let chars: &[u32] = if emoji {
509 chars_storage = tokens
510 .iter()
511 .filter(|t| !t.is_emoji)
512 .flat_map(|t| t.cps.iter().copied())
513 .collect::<Vec<_>>();
514 &chars_storage
515 } else {
516 &output
517 };
518 if chars.is_empty() {
519 "Emoji".to_string()
520 } else {
521 if ENS.cm.contains(&output[0]) {
522 return Err(error_placement("leading combining mark"));
523 }
524 for i in 1..tokens.len() {
525 if !tokens[i].is_emoji && ENS.cm.contains(&tokens[i].cps[0]) {
526 let prev = str_from_cps(&tokens[i - 1].cps)?;
527 let mark = safe_str_from_cps(&[tokens[i].cps[0]], None);
528 return Err(error_placement(&format!(
529 "emoji + combining mark: \"{prev} + {mark}\""
530 )));
531 }
532 }
533
534 check_fenced(&output)?;
535 let unique = unique_preserving_order(chars);
536 let group = determine_group(&unique)?;
537 check_group(group, chars)?;
538 check_whole(group, &unique)?;
539 ENS.groups[group].name.clone()
540 }
541 };
542
543 info.label_type = Some(label_type);
544 Ok(())
545}
546
547fn process_label_output(input: &[u32], nf: NormalizeForm, ef: EmojiFilter) -> Result<Vec<u32>> {
548 let tokens = tokens_from_str(input, nf, ef)?;
549 if tokens.is_empty() {
550 return Err(EnsError::new("empty label"));
551 }
552
553 let output: Vec<u32> = tokens.iter().flat_map(|t| t.cps.iter().copied()).collect();
554 check_leading_underscore(&output)?;
555 let emoji = tokens.len() > 1 || tokens[0].is_emoji;
556 if !emoji && output.iter().all(|&cp| cp < 0x80) {
557 check_label_extension(&output)?;
558 } else {
559 let chars_storage;
560 let chars: &[u32] = if emoji {
561 chars_storage = tokens
562 .iter()
563 .filter(|t| !t.is_emoji)
564 .flat_map(|t| t.cps.iter().copied())
565 .collect::<Vec<_>>();
566 &chars_storage
567 } else {
568 &output
569 };
570 if !chars.is_empty() {
571 if ENS.cm.contains(&output[0]) {
572 return Err(error_placement("leading combining mark"));
573 }
574 for i in 1..tokens.len() {
575 if !tokens[i].is_emoji && ENS.cm.contains(&tokens[i].cps[0]) {
576 let prev = str_from_cps(&tokens[i - 1].cps)?;
577 let mark = safe_str_from_cps(&[tokens[i].cps[0]], None);
578 return Err(error_placement(&format!(
579 "emoji + combining mark: \"{prev} + {mark}\""
580 )));
581 }
582 }
583
584 check_fenced(&output)?;
585 let unique = unique_preserving_order(chars);
586 let group = determine_group(&unique)?;
587 check_group(group, chars)?;
588 check_whole(group, &unique)?;
589 }
590 }
591
592 Ok(output)
593}
594
595fn process_text_label_output(input: &[u32]) -> Option<Result<Vec<u32>>> {
596 let mut chars = Vec::with_capacity(input.len());
597 for &cp in input {
598 if ENS.emoji_root.nodes[0].children.contains_key(&cp) {
599 return None;
600 }
601 if ENS.valid.contains(&cp) {
602 chars.push(cp);
603 } else if let Some(cps) = ENS.mapped.get(&cp) {
604 chars.extend_from_slice(cps);
605 } else if !ENS.ignored.contains(&cp) {
606 return Some(Err(error_disallowed(cp)));
607 }
608 }
609
610 let output = NormalizeForm::Nfc.apply(&chars);
611 Some(validate_text_label_output(&output).map(|()| output))
612}
613
614fn validate_text_label_output(output: &[u32]) -> Result<()> {
615 if output.is_empty() {
616 return Err(EnsError::new("empty label"));
617 }
618 check_leading_underscore(output)?;
619 if output.iter().all(|&cp| cp < 0x80) {
620 check_label_extension(output)?;
621 } else {
622 if ENS.cm.contains(&output[0]) {
623 return Err(error_placement("leading combining mark"));
624 }
625 check_fenced(output)?;
626 let unique = unique_preserving_order(output);
627 let group = determine_group(&unique)?;
628 check_group(group, output)?;
629 check_whole(group, &unique)?;
630 }
631 Ok(())
632}
633
634fn normalize_labels(name: &str) -> Result<String> {
635 if name.is_empty() {
636 return Ok(String::new());
637 }
638
639 let labels: Vec<&str> = name.split('.').collect();
640 let multiple = labels.len() != 1;
641 let mut out = String::with_capacity(name.len());
642 for (i, label) in labels.iter().enumerate() {
643 if i > 0 {
644 out.push('.');
645 }
646 if let Some(label) = normalize_ascii_label(label) {
647 out.push_str(&label);
648 continue;
649 }
650 let input = explode_cp(label);
651 let result = process_text_label_output(&input).unwrap_or_else(|| {
652 process_label_output(&input, NormalizeForm::Nfc, EmojiFilter::DropFe0f)
653 });
654 match result {
655 Ok(output) => out.push_str(&str_from_cps(&output)?),
656 Err(error) if multiple => {
657 let safe = safe_str_from_cps(&input, Some(63));
658 return Err(EnsError::new(format!(
659 "Invalid label {}: {}",
660 bidi_qq(&safe),
661 error.message()
662 )));
663 }
664 Err(error) => return Err(error),
665 }
666 }
667 Ok(out)
668}
669
670fn normalize_ascii(name: &str) -> Option<Result<String>> {
671 if name.is_empty() {
672 return Some(Ok(String::new()));
673 }
674 if !name.is_ascii() {
675 return None;
676 }
677
678 let mut start = 0;
679 let mut changed = false;
680 for (i, byte) in name.bytes().enumerate() {
681 if byte == b'.' {
682 if !valid_ascii_label(&name.as_bytes()[start..i]) {
683 return None;
684 }
685 start = i + 1;
686 } else if byte.is_ascii_uppercase() {
687 changed = true;
688 } else if !is_valid_ascii_byte(byte) {
689 return None;
690 }
691 }
692
693 if !valid_ascii_label(&name.as_bytes()[start..]) {
694 return None;
695 }
696
697 if changed {
698 let mut out = String::with_capacity(name.len());
699 for byte in name.bytes() {
700 if byte.is_ascii_uppercase() {
701 out.push(char::from(byte + 32));
702 } else {
703 out.push(char::from(byte));
704 }
705 }
706 Some(Ok(out))
707 } else {
708 Some(Ok(name.to_owned()))
709 }
710}
711
712fn normalize_ascii_label(label: &str) -> Option<Cow<'_, str>> {
713 if label.is_empty() || !label.is_ascii() {
714 return None;
715 }
716 let bytes = label.as_bytes();
717 if !valid_ascii_label(bytes) {
718 return None;
719 }
720
721 let mut changed = false;
722 for &byte in bytes {
723 if byte.is_ascii_uppercase() {
724 changed = true;
725 } else if !is_valid_ascii_byte(byte) {
726 return None;
727 }
728 }
729
730 if changed {
731 let mut out = String::with_capacity(label.len());
732 for byte in label.bytes() {
733 if byte.is_ascii_uppercase() {
734 out.push(char::from(byte + 32));
735 } else {
736 out.push(char::from(byte));
737 }
738 }
739 Some(Cow::Owned(out))
740 } else {
741 Some(Cow::Borrowed(label))
742 }
743}
744
/// Reports whether `byte` is a lowercase ASCII letter, an ASCII digit, `-`, `_` or `$`.
fn is_valid_ascii_byte(byte: u8) -> bool {
    byte.is_ascii_lowercase() || byte.is_ascii_digit() || matches!(byte, b'-' | b'_' | b'$')
}
748
/// Checks the structural rules for an ASCII label: it is not empty, it does not have `--` at
/// byte positions 2 and 3, and underscores occur only as a leading run.
fn valid_ascii_label(label: &[u8]) -> bool {
    if label.is_empty() {
        return false;
    }
    if label.get(2..4) == Some(&b"--"[..]) {
        return false;
    }
    // After the leading underscores, no further underscore may follow.
    let leading = label.iter().take_while(|&&byte| byte == b'_').count();
    !label[leading..].contains(&b'_')
}
761
762fn unique_preserving_order(cps: &[u32]) -> Vec<u32> {
763 if cps.len() <= 64 {
764 let mut unique = Vec::new();
765 for &cp in cps {
766 if !unique.contains(&cp) {
767 unique.push(cp);
768 }
769 }
770 return unique;
771 }
772
773 let mut seen = IntSet::default();
774 let mut unique = Vec::new();
775 for &cp in cps {
776 if seen.insert(cp) {
777 unique.push(cp);
778 }
779 }
780 unique
781}
782
783fn check_label_extension(cps: &[u32]) -> Result<()> {
784 if cps.len() >= 4 && cps[2] == HYPHEN && cps[3] == HYPHEN {
785 let s = str_from_cps(&cps[..4])?;
786 Err(EnsError::new(format!("invalid label extension: \"{s}\"")))
787 } else {
788 Ok(())
789 }
790}
791
792fn check_leading_underscore(cps: &[u32]) -> Result<()> {
793 const UNDERSCORE: u32 = 0x5F;
794 if let Some(mut i) = cps.iter().rposition(|&cp| cp == UNDERSCORE) {
795 while i > 0 {
796 i -= 1;
797 if cps[i] != UNDERSCORE {
798 return Err(EnsError::new("underscore allowed only at start"));
799 }
800 }
801 }
802 Ok(())
803}
804
805fn check_fenced(cps: &[u32]) -> Result<()> {
806 if cps.is_empty() {
807 return Ok(());
808 }
809 let mut prev = ENS.fenced.get(&cps[0]);
810 if let Some(prev) = prev {
811 return Err(error_placement(&format!("leading {prev}")));
812 }
813
814 let mut last = usize::MAX;
815 for (i, &cp) in cps.iter().enumerate().skip(1) {
816 if let Some(matched) = ENS.fenced.get(&cp) {
817 if last == i {
818 return Err(error_placement(&format!("{} + {matched}", prev.unwrap())));
819 }
820 last = i + 1;
821 prev = Some(matched);
822 }
823 }
824 if last == cps.len()
825 && let Some(prev) = prev
826 {
827 return Err(error_placement(&format!("trailing {prev}")));
828 }
829 Ok(())
830}
831
832fn determine_group(unique: &[u32]) -> Result<usize> {
833 let mut groups: Option<Vec<usize>> = None;
834 for &cp in unique {
835 let Some(cp_groups) = ENS.group_members.get(&cp) else {
836 return Err(error_disallowed(cp));
837 };
838 let gs: Vec<usize> = if let Some(groups) = groups.take() {
839 let first = groups[0];
840 let filtered: Vec<usize> = groups
841 .into_iter()
842 .filter(|i| cp_groups.contains(i))
843 .collect();
844 if filtered.is_empty() {
845 return Err(error_group_member(first, cp));
846 }
847 filtered
848 } else {
849 cp_groups.clone()
850 };
851 if gs.len() == 1 {
852 return Ok(gs[0]);
853 }
854 groups = Some(gs);
855 }
856 Ok(groups.expect("unique has at least one code point")[0])
857}
858
859fn check_group(group: usize, cps: &[u32]) -> Result<()> {
860 let g = &ENS.groups[group];
861 for &cp in cps {
862 if !g.has_cp(cp) {
863 return Err(error_group_member(group, cp));
864 }
865 }
866
867 if g.check_nsm && cps.iter().any(|cp| ENS.nsm_check.contains(cp)) {
868 let decomposed = nfd(cps);
869 let mut i = 1usize;
870 while i < decomposed.len() {
871 if ENS.nsm.contains(&decomposed[i]) {
872 let mut j = i + 1;
873 while j < decomposed.len() && ENS.nsm.contains(&decomposed[j]) {
874 for k in i..j {
875 if decomposed[k] == decomposed[j] {
876 return Err(EnsError::new(format!(
877 "duplicate non-spacing marks: {}",
878 quoted_cp(decomposed[j])
879 )));
880 }
881 }
882 j += 1;
883 }
884 if j - i > ENS.nsm_max {
885 let s = safe_str_from_cps(&decomposed[i - 1..j], None);
886 return Err(EnsError::new(format!(
887 "excessive non-spacing marks: {} ({}/{})",
888 bidi_qq(&s),
889 j - i,
890 ENS.nsm_max
891 )));
892 }
893 i = j;
894 } else {
895 i += 1;
896 }
897 }
898 }
899
900 Ok(())
901}
902
903fn check_whole(group: usize, unique: &[u32]) -> Result<()> {
904 let mut maker: Option<Vec<usize>> = None;
905 let mut shared = Vec::new();
906 for &cp in unique {
907 match ENS.whole_map.get(&cp).copied() {
908 Some(UNIQUE_PH) => return Ok(()),
909 Some(whole_index) => {
910 let set = ENS.wholes[whole_index]
911 .complements
912 .get(&cp)
913 .cloned()
914 .unwrap_or_default();
915 maker = Some(match maker {
916 Some(prev) => prev.into_iter().filter(|g| set.contains(g)).collect(),
917 None => set,
918 });
919 if maker.as_ref().is_some_and(|m| m.is_empty()) {
920 return Ok(());
921 }
922 }
923 None => shared.push(cp),
924 }
925 }
926
927 if let Some(maker) = maker {
928 for other in maker {
929 if shared.iter().all(|&cp| ENS.groups[other].has_cp(cp)) {
930 return Err(EnsError::new(format!(
931 "whole-script confusable: {}/{}",
932 ENS.groups[group].name, ENS.groups[other].name
933 )));
934 }
935 }
936 }
937 Ok(())
938}
939
940fn flatten(labels: Vec<Label>) -> Result<String> {
941 let multiple = labels.len() != 1;
942 let mut out = Vec::new();
943 for label in labels {
944 if let Some(error) = label.error {
945 if multiple {
946 let safe = safe_str_from_cps(&label.input, Some(63));
947 return Err(EnsError::new(format!(
948 "Invalid label {}: {}",
949 bidi_qq(&safe),
950 error.message()
951 )));
952 }
953 return Err(error);
954 }
955 out.push(str_from_cps(label.output.as_deref().unwrap_or_default())?);
956 }
957 Ok(out.join("."))
958}
959
960fn quoted_cp(cp: u32) -> String {
961 let prefix = if should_escape(cp) {
962 String::new()
963 } else {
964 format!("{} ", bidi_qq(&safe_str_from_cps(&[cp], None)))
965 };
966 format!("{prefix}{}", quote_cp(cp))
967}
968
969fn error_disallowed(cp: u32) -> EnsError {
970 EnsError::new(format!("disallowed character: {}", quoted_cp(cp)))
971}
972
973fn error_group_member(group: usize, cp: u32) -> EnsError {
974 let mut quoted = quoted_cp(cp);
975 if let Some(&gg) = ENS.primary_group.get(&cp) {
976 let gg = &ENS.groups[gg];
977 quoted = format!("{} {quoted}", gg.name);
978 }
979 EnsError::new(format!(
980 "illegal mixture: {} + {quoted}",
981 ENS.groups[group].name
982 ))
983}
984
985fn error_placement(where_: &str) -> EnsError {
986 EnsError::new(format!("illegal placement: {where_}"))
987}
988
989#[derive(Debug, Clone, Copy)]
990enum NormalizeForm {
991 Nfc,
992 Nfd,
993}
994
995impl NormalizeForm {
996 fn apply(self, cps: &[u32]) -> Vec<u32> {
997 match self {
998 Self::Nfc if !requires_check(cps) => cps.to_vec(),
999 Self::Nfc => nfc(cps),
1000 Self::Nfd => nfd(cps),
1001 }
1002 }
1003}
1004
/// How the code points of a matched emoji are emitted.
#[derive(Debug, Clone, Copy)]
enum EmojiFilter {
    /// Emit the emoji sequence unchanged.
    Preserve,
    /// Emit the emoji sequence with every U+FE0F removed.
    DropFe0f,
}
1010
1011fn filter_emoji(cps: &[u32], filter: EmojiFilter) -> Vec<u32> {
1012 match filter {
1013 EmojiFilter::Preserve => cps.to_vec(),
1014 EmojiFilter::DropFe0f => cps.iter().copied().filter(|&cp| cp != FE0F).collect(),
1015 }
1016}
1017
1018fn tokens_from_str(input: &[u32], nf: NormalizeForm, ef: EmojiFilter) -> Result<Vec<NormToken>> {
1019 let mut ret = Vec::new();
1020 let mut chars = Vec::new();
1021 let mut input = input.to_vec();
1022 input.reverse();
1023
1024 while !input.is_empty() {
1025 if let Some(emoji) = consume_emoji_reversed(&mut input, None) {
1026 if !chars.is_empty() {
1027 ret.push(NormToken {
1028 cps: nf.apply(&chars),
1029 is_emoji: false,
1030 });
1031 chars.clear();
1032 }
1033 ret.push(NormToken {
1034 cps: filter_emoji(&emoji, ef),
1035 is_emoji: true,
1036 });
1037 } else {
1038 let cp = input.pop().expect("input is not empty");
1039 if ENS.valid.contains(&cp) {
1040 chars.push(cp);
1041 } else if let Some(cps) = ENS.mapped.get(&cp) {
1042 chars.extend_from_slice(cps);
1043 } else if !ENS.ignored.contains(&cp) {
1044 return Err(error_disallowed(cp));
1045 }
1046 }
1047 }
1048
1049 if !chars.is_empty() {
1050 ret.push(NormToken {
1051 cps: nf.apply(&chars),
1052 is_emoji: false,
1053 });
1054 }
1055
1056 Ok(ret)
1057}
1058
1059fn consume_emoji_reversed(input: &mut Vec<u32>, eaten: Option<&mut Vec<u32>>) -> Option<Vec<u32>> {
1060 let mut eaten = eaten;
1061 let mut node = 0usize;
1062 let mut emoji = None;
1063 let mut pos = input.len();
1064 while pos > 0 {
1065 pos -= 1;
1066 let cp = input[pos];
1067 let Some(&child) = ENS.emoji_root.nodes[node].children.get(&cp) else {
1068 break;
1069 };
1070 node = child;
1071 if let Some(value) = ENS.emoji_root.nodes[node].value.clone() {
1072 if let Some(eaten) = eaten.as_deref_mut() {
1073 eaten.extend(input[pos..].iter().rev().copied());
1074 }
1075 input.truncate(pos);
1076 emoji = Some(value);
1077 }
1078 }
1079 emoji
1080}
1081
1082pub fn ens_tokenize(name: &str) -> Vec<Token> {
1083 ens_tokenize_with_options(name, TokenizeOptions::default())
1084}
1085
1086pub fn ens_tokenize_with_options(name: &str, options: TokenizeOptions) -> Vec<Token> {
1087 tokenize(name, options.nf)
1088}
1089
1090fn tokenize(name: &str, nf: bool) -> Vec<Token> {
1091 let mut input = explode_cp(name);
1092 input.reverse();
1093 let mut eaten = Vec::new();
1094 let mut tokens = Vec::new();
1095
1096 while !input.is_empty() {
1097 if let Some(emoji) = consume_emoji_reversed(&mut input, Some(&mut eaten)) {
1098 tokens.push(Token::Emoji {
1099 input: std::mem::take(&mut eaten),
1100 cps: filter_emoji(&emoji, EmojiFilter::DropFe0f),
1101 emoji,
1102 });
1103 } else {
1104 let cp = input.pop().expect("input is not empty");
1105 if cp == STOP {
1106 tokens.push(Token::Stop { cp });
1107 } else if ENS.valid.contains(&cp) {
1108 tokens.push(Token::Valid { cps: vec![cp] });
1109 } else if ENS.ignored.contains(&cp) {
1110 tokens.push(Token::Ignored { cp });
1111 } else if let Some(cps) = ENS.mapped.get(&cp) {
1112 tokens.push(Token::Mapped {
1113 cp,
1114 cps: cps.clone(),
1115 });
1116 } else {
1117 tokens.push(Token::Disallowed { cp });
1118 }
1119 }
1120 }
1121
1122 if nf {
1123 apply_token_nfc(&mut tokens);
1124 }
1125
1126 collapse_valid_tokens(tokens)
1127}
1128
1129fn is_valid_or_mapped(token: &Token) -> bool {
1130 matches!(token, Token::Valid { .. } | Token::Mapped { .. })
1131}
1132
1133fn valid_or_mapped_cps(token: &Token) -> Option<&[u32]> {
1134 match token {
1135 Token::Valid { cps } | Token::Mapped { cps, .. } => Some(cps),
1136 _ => None,
1137 }
1138}
1139
1140fn requires_check(cps: &[u32]) -> bool {
1141 cps.iter().any(|cp| ENS.nfc_check.contains(cp))
1142}
1143
1144fn apply_token_nfc(tokens: &mut Vec<Token>) {
1145 let mut i = 0usize;
1146 let mut start: Option<usize> = None;
1147 while i < tokens.len() {
1148 if is_valid_or_mapped(&tokens[i]) {
1149 let cps = valid_or_mapped_cps(&tokens[i]).unwrap();
1150 if requires_check(cps) {
1151 let mut end = i + 1;
1152 let mut pos = end;
1153 while pos < tokens.len() {
1154 if let Some(cps) = valid_or_mapped_cps(&tokens[pos]) {
1155 if !requires_check(cps) {
1156 break;
1157 }
1158 end = pos + 1;
1159 } else if !matches!(tokens[pos], Token::Ignored { .. }) {
1160 break;
1161 }
1162 pos += 1;
1163 }
1164 let start_i = start.unwrap_or(i);
1165 let slice = tokens[start_i..end].to_vec();
1166 let cps0: Vec<u32> = slice
1167 .iter()
1168 .filter_map(valid_or_mapped_cps)
1169 .flat_map(|cps| cps.iter().copied())
1170 .collect();
1171 let cps = nfc(&cps0);
1172 if compare_arrays(&cps, &cps0) != 0 {
1173 let text = str_from_cps(&cps).unwrap_or_default();
1174 let replacement = Token::Nfc {
1175 input: cps0,
1176 tokens0: collapse_valid_tokens(slice),
1177 cps,
1178 tokens: tokenize(&text, false),
1179 };
1180 tokens.splice(start_i..end, [replacement]);
1181 i = start_i;
1182 } else {
1183 i = end.saturating_sub(1);
1184 }
1185 start = None;
1186 } else {
1187 start = Some(i);
1188 }
1189 } else if !matches!(tokens[i], Token::Ignored { .. }) {
1190 start = None;
1191 }
1192 i += 1;
1193 }
1194}
1195
1196fn collapse_valid_tokens(tokens: Vec<Token>) -> Vec<Token> {
1197 let mut out = Vec::new();
1198 let mut i = 0usize;
1199 while i < tokens.len() {
1200 if let Token::Valid { .. } = &tokens[i] {
1201 let mut cps = Vec::new();
1202 while i < tokens.len() {
1203 if let Token::Valid { cps: next } = &tokens[i] {
1204 cps.extend_from_slice(next);
1205 i += 1;
1206 } else {
1207 break;
1208 }
1209 }
1210 out.push(Token::Valid { cps });
1211 } else {
1212 out.push(tokens[i].clone());
1213 i += 1;
1214 }
1215 }
1216 out
1217}