1use crate::nf::{nfc, nfd};
2use crate::utils::{
3 EnsError, Result, array_replace, bidi_qq, compare_arrays, explode_cp, quote_cp,
4 safe_str_from_cps, str_from_cps,
5};
6use serde::Deserialize;
7use std::collections::{HashMap, HashSet};
8use std::sync::LazyLock;
9
/// U+002D; two of these at positions 2 and 3 of an ASCII label are rejected as a label extension.
const HYPHEN: u32 = 0x2D;
/// U+002E; pushed between labels when a fragment is re-joined and tokenized as `Token::Stop`.
const STOP: u32 = 0x2E;
/// U+FE0F; removed from emoji output by `EmojiFilter::DropFe0f` and handled specially when the
/// emoji trie is built.
const FE0F: u32 = 0xFE0F;
/// Placeholder stored in `EnsData::whole_map` (instead of an index into `EnsData::wholes`) for
/// code points that occur in exactly one group and have no confusable entry.
const UNIQUE_PH: usize = usize::MAX;
14
/// Per-label result produced when a name is split on `.` and each label is processed.
///
/// The optional fields are filled in progressively; when processing fails part-way, `error` is
/// set and the fields computed before the failure keep their values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Label {
    /// The label as returned by `explode_cp`, before any processing.
    pub input: Vec<u32>,
    /// Position of the label within the name, counted in `input` elements plus one for every
    /// preceding `.` separator.
    pub offset: usize,
    /// The error raised while processing this label, if any.
    pub error: Option<EnsError>,
    /// The code points of each token, in order.
    pub tokens: Option<Vec<Vec<u32>>>,
    /// All token code points concatenated.
    pub output: Option<Vec<u32>>,
    /// `true` when the label has more than one token or its first token is an emoji.
    pub emoji: Option<bool>,
    /// `"ASCII"`, `"Emoji"`, or the name of the group the label was assigned to.
    pub label_type: Option<String>,
}
25
/// One element of the token stream returned by the tokenizer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// The label separator code point.
    Stop {
        cp: u32,
    },
    /// A code point that is neither valid, ignored, nor mapped.
    Disallowed {
        cp: u32,
    },
    /// A code point from the ignored set.
    Ignored {
        cp: u32,
    },
    /// A run of valid code points (adjacent valid tokens are collapsed into one).
    Valid {
        cps: Vec<u32>,
    },
    /// A code point `cp` together with the code points `cps` it maps to.
    Mapped {
        cp: u32,
        cps: Vec<u32>,
    },
    /// An emoji sequence: `input` is what was consumed from the name, `emoji` is the sequence
    /// stored in the emoji table, and `cps` is that sequence with U+FE0F removed.
    Emoji {
        input: Vec<u32>,
        cps: Vec<u32>,
        emoji: Vec<u32>,
    },
    /// A span whose NFC form differs from its input: `tokens0` are the original tokens, `input`
    /// their concatenated valid/mapped code points, `cps` the NFC result, and `tokens` the
    /// re-tokenization of `cps`.
    Nfc {
        input: Vec<u32>,
        tokens0: Vec<Token>,
        cps: Vec<u32>,
        tokens: Vec<Token>,
    },
}

impl Token {
    /// Returns the lowercase name of the variant.
    pub fn token_type(&self) -> &'static str {
        match self {
            Token::Nfc { .. } => "nfc",
            Token::Emoji { .. } => "emoji",
            Token::Mapped { .. } => "mapped",
            Token::Valid { .. } => "valid",
            Token::Ignored { .. } => "ignored",
            Token::Disallowed { .. } => "disallowed",
            Token::Stop { .. } => "stop",
        }
    }

    /// Returns the token's `cps` field, or `None` for variants that do not have one.
    pub fn cps(&self) -> Option<&[u32]> {
        match self {
            Token::Stop { .. } | Token::Disallowed { .. } | Token::Ignored { .. } => None,
            Token::Valid { cps } => Some(cps.as_slice()),
            Token::Mapped { cps, .. } => Some(cps.as_slice()),
            Token::Emoji { cps, .. } => Some(cps.as_slice()),
            Token::Nfc { cps, .. } => Some(cps.as_slice()),
        }
    }
}
80
/// Options accepted by `ens_tokenize_with_options`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenizeOptions {
    /// When `true`, the NFC pass is applied to the token stream after tokenizing.
    pub nf: bool,
}

impl Default for TokenizeOptions {
    /// The default enables the NFC pass.
    fn default() -> Self {
        let nf = true;
        TokenizeOptions { nf }
    }
}
91
/// Shape of the embedded `spec.json` document as deserialized by serde; converted into
/// `EnsData` by `EnsData::from_raw`.
#[derive(Deserialize)]
struct RawSpec {
    // Emoji sequences; sorted and inserted into the emoji trie.
    emoji: Vec<Vec<u32>>,
    // Code points that are skipped during normalization.
    ignored: Vec<u32>,
    // Pairs of (code point, replacement code points).
    mapped: Vec<(u32, Vec<u32>)>,
    // Pairs of (code point, display name used in placement errors).
    fenced: Vec<(u32, String)>,
    // Whole-script confusable records.
    wholes: Vec<RawWhole>,
    // Combining marks.
    cm: Vec<u32>,
    // Non-spacing marks.
    nsm: Vec<u32>,
    // Longest permitted run of non-spacing marks.
    nsm_max: usize,
    // Code points for which `should_escape` returns true.
    escape: Vec<u32>,
    // Script groups.
    groups: Vec<RawGroup>,
    // Code points whose presence triggers an NFC comparison while tokenizing.
    nfc_check: Vec<u32>,
}
106
/// One whole-script confusable record from the spec.
#[derive(Deserialize)]
struct RawWhole {
    // Members that take part in the complement computation but are not registered in the whole map.
    valid: Vec<u32>,
    // Members that are registered in the whole map; records with none are skipped entirely.
    confused: Vec<u32>,
}
112
/// One script group from the spec.
#[derive(Deserialize)]
struct RawGroup {
    name: String,
    // Defaults to `false` when absent; restricted groups are renamed `Restricted[<name>]`.
    #[serde(default)]
    restricted: bool,
    primary: Vec<u32>,
    secondary: Vec<u32>,
    // Only presence is inspected: when absent, non-spacing-mark checks are enabled for the group.
    cm: Option<Vec<serde_json::Value>>,
}
122
/// A script group with its member code points.
struct Group {
    // Display name; restricted groups are stored as `Restricted[<name>]`.
    name: String,
    primary: HashSet<u32>,
    secondary: HashSet<u32>,
    // Whether labels of this group get the non-spacing-mark checks.
    check_nsm: bool,
}

impl Group {
    /// Returns `true` when `cp` is a primary or secondary member of the group.
    fn has_cp(&self, cp: u32) -> bool {
        if self.primary.contains(&cp) {
            return true;
        }
        self.secondary.contains(&cp)
    }
}
135
/// Pre-computed data for one whole-script confusable record.
struct Whole {
    // For each member code point: indices (into `EnsData::groups`) of the groups used by this
    // record that are not connected to that code point.
    complements: HashMap<u32, Vec<usize>>,
}
139
/// A node of the emoji trie.
#[derive(Default)]
struct EmojiNode {
    // Child node indices (into `EmojiTrie::nodes`) keyed by code point.
    children: HashMap<u32, usize>,
    // The emoji sequence that ends at this node, if any.
    value: Option<Vec<u32>>,
}

/// Index-based trie over emoji code point sequences; node 0 is the root.
#[derive(Default)]
struct EmojiTrie {
    nodes: Vec<EmojiNode>,
}

impl EmojiTrie {
    /// Creates a trie that contains only the root node.
    fn new() -> Self {
        let root = EmojiNode::default();
        EmojiTrie { nodes: vec![root] }
    }

    /// Returns the index of the child of `node` reached via `cp`, appending a fresh node first
    /// when that edge does not exist yet.
    fn child_or_insert(&mut self, node: usize, cp: u32) -> usize {
        match self.nodes[node].children.get(&cp).copied() {
            Some(existing) => existing,
            None => {
                let fresh = self.nodes.len();
                self.nodes.push(EmojiNode::default());
                self.nodes[node].children.insert(cp, fresh);
                fresh
            }
        }
    }
}
168
/// Lookup tables derived from the spec; built once by `EnsData::from_raw`.
struct EnsData {
    // Code point -> replacement code points.
    mapped: HashMap<u32, Vec<u32>>,
    // Code points that are dropped during normalization.
    ignored: HashSet<u32>,
    // Combining marks.
    cm: HashSet<u32>,
    // Non-spacing marks.
    nsm: HashSet<u32>,
    // Longest permitted run of non-spacing marks.
    nsm_max: usize,
    // Code points for which `should_escape` returns true.
    escape: HashSet<u32>,
    // Code points whose presence triggers an NFC comparison while tokenizing.
    nfc_check: HashSet<u32>,
    // Code point -> display name used in placement errors.
    fenced: HashMap<u32, String>,
    groups: Vec<Group>,
    // Code point -> index into `wholes`, or `UNIQUE_PH` for code points that belong to exactly
    // one group and have no confusable entry.
    whole_map: HashMap<u32, usize>,
    wholes: Vec<Whole>,
    // Every group member plus every code point of their NFD decomposition.
    valid: HashSet<u32>,
    // Emoji sequences, sorted with `compare_arrays`.
    emoji_list: Vec<Vec<u32>>,
    emoji_root: EmojiTrie,
}
185
/// Process-wide tables, built on first access from the `spec.json` embedded at compile time.
///
/// Panics on first access if the embedded document fails to deserialize into `RawSpec`.
static ENS: LazyLock<EnsData> = LazyLock::new(|| {
    let raw: RawSpec =
        serde_json::from_str(include_str!("../data/spec.json")).expect("valid spec.json");
    EnsData::from_raw(raw)
});
191
impl EnsData {
    /// Converts the deserialized spec into the lookup tables used at runtime.
    ///
    /// The steps depend on each other: groups are needed to compute the whole-script
    /// complements, the whole map must be populated before the `UNIQUE_PH` placeholders are
    /// added, and the valid set is extended with NFD parts only after those placeholders exist.
    fn from_raw(raw: RawSpec) -> Self {
        // Build the groups; restricted ones are renamed `Restricted[<name>]`.
        let groups: Vec<Group> = raw
            .groups
            .into_iter()
            .map(|g| {
                let name = if g.restricted {
                    format!("Restricted[{}]", g.name)
                } else {
                    g.name
                };
                Group {
                    name,
                    primary: g.primary.into_iter().collect(),
                    secondary: g.secondary.into_iter().collect(),
                    // Non-spacing-mark checks apply only when the spec entry has no `cm` value.
                    check_nsm: g.cm.is_none(),
                }
            })
            .collect();

        // Register every confused code point with the index of its whole record.
        let mut wholes = Vec::new();
        let mut whole_map = HashMap::new();
        for raw_whole in raw.wholes {
            // Records without confused members would never be reachable through `whole_map`.
            if raw_whole.confused.is_empty() {
                continue;
            }

            // Complements are computed over the valid and the confused members together.
            let values: Vec<u32> = raw_whole
                .valid
                .iter()
                .chain(raw_whole.confused.iter())
                .copied()
                .collect();
            let complements = compute_whole_complements(&groups, &values);
            let whole_index = wholes.len();
            for cp in raw_whole.confused {
                whole_map.insert(cp, whole_index);
            }
            wholes.push(Whole { complements });
        }

        // `valid`: code points seen in at least one group set; `multi`: seen more than once.
        let mut valid = HashSet::new();
        let mut multi = HashSet::new();
        for g in &groups {
            for &cp in g.primary.iter().chain(g.secondary.iter()) {
                if !valid.insert(cp) {
                    multi.insert(cp);
                }
            }
        }

        // Code points that are unique to one group and not confusable get the placeholder.
        for &cp in &valid {
            if !whole_map.contains_key(&cp) && !multi.contains(&cp) {
                whole_map.insert(cp, UNIQUE_PH);
            }
        }

        // Extend the valid set with every code point of the members' NFD decomposition.
        let valid_vec: Vec<u32> = valid.iter().copied().collect();
        for cp in nfd(&valid_vec) {
            valid.insert(cp);
        }

        // Build the emoji trie from the sorted emoji list.
        let mut emoji_list = raw.emoji;
        emoji_list.sort_by(|a, b| compare_arrays(a, b).cmp(&0));
        let mut emoji_root = EmojiTrie::new();
        for cps in &emoji_list {
            // `prev` holds every trie node that the prefix processed so far can end at.
            let mut prev = vec![0usize];
            for &cp in cps {
                let next: Vec<usize> = prev
                    .iter()
                    .map(|&node| emoji_root.child_or_insert(node, cp))
                    .collect();
                if cp == FE0F {
                    // Keep the old frontier too, so the sequence also matches without this FE0F.
                    prev.extend(next);
                } else {
                    prev = next;
                }
            }
            // Every end node yields the full sequence as listed in the spec.
            for node in prev {
                emoji_root.nodes[node].value = Some(cps.clone());
            }
        }

        Self {
            mapped: raw.mapped.into_iter().collect(),
            ignored: raw.ignored.into_iter().collect(),
            cm: raw.cm.into_iter().collect(),
            nsm: raw.nsm.into_iter().collect(),
            nsm_max: raw.nsm_max,
            escape: raw.escape.into_iter().collect(),
            nfc_check: raw.nfc_check.into_iter().collect(),
            fenced: raw.fenced.into_iter().collect(),
            groups,
            whole_map,
            wholes,
            valid,
            emoji_list,
            emoji_root,
        }
    }
}
293
/// Scratch record used by `compute_whole_complements`: a set of connected groups and the code
/// points assigned to it.
struct WholeRec {
    // Group indices, kept free of duplicates via `push_unique`.
    groups: Vec<usize>,
    // Code points assigned to this record, in input order.
    values: Vec<u32>,
}
298
/// Appends `x` to `v` unless an equal element is already present.
fn push_unique(v: &mut Vec<usize>, x: usize) {
    let already_present = v.iter().any(|&item| item == x);
    if already_present {
        return;
    }
    v.push(x);
}
304
305fn compute_whole_complements(groups: &[Group], values: &[u32]) -> HashMap<u32, Vec<usize>> {
306 let mut recs: Vec<WholeRec> = Vec::new();
307 for &cp in values {
308 let gs: Vec<usize> = groups
309 .iter()
310 .enumerate()
311 .filter_map(|(i, g)| g.has_cp(cp).then_some(i))
312 .collect();
313 let rec_index = recs
314 .iter()
315 .position(|rec| gs.iter().any(|g| rec.groups.contains(g)));
316 let rec_index = match rec_index {
317 Some(i) => i,
318 None => {
319 recs.push(WholeRec {
320 groups: Vec::new(),
321 values: Vec::new(),
322 });
323 recs.len() - 1
324 }
325 };
326 recs[rec_index].values.push(cp);
327 for g in gs {
328 push_unique(&mut recs[rec_index].groups, g);
329 }
330 }
331
332 let mut union = Vec::new();
333 for rec in &recs {
334 for &g in &rec.groups {
335 push_unique(&mut union, g);
336 }
337 }
338
339 let mut complements = HashMap::new();
340 for rec in recs {
341 let complement: Vec<usize> = union
342 .iter()
343 .copied()
344 .filter(|g| !rec.groups.contains(g))
345 .collect();
346 for cp in rec.values {
347 complements.insert(cp, complement.clone());
348 }
349 }
350 complements
351}
352
/// A token produced by `tokens_from_str`: either one emoji sequence or one run of text.
#[derive(Clone)]
struct NormToken {
    // For text: the normalized run; for emoji: the sequence after the emoji filter.
    cps: Vec<u32>,
    is_emoji: bool,
}
358
359pub fn is_combining_mark(cp: u32, only_nsm: bool) -> bool {
360 if only_nsm {
361 ENS.nsm.contains(&cp)
362 } else {
363 ENS.cm.contains(&cp)
364 }
365}
366
367pub fn should_escape(cp: u32) -> bool {
368 ENS.escape.contains(&cp)
369}
370
371pub fn ens_emoji() -> Vec<Vec<u32>> {
372 ENS.emoji_list.clone()
373}
374
375pub fn ens_normalize_fragment(frag: &str, decompose: bool) -> Result<String> {
376 let nf = if decompose { nfd } else { nfc };
377 let mut out = Vec::new();
378 for (i, label) in frag.split('.').enumerate() {
379 if i > 0 {
380 out.push(STOP);
381 }
382 let input = explode_cp(label);
383 let tokens = tokens_from_str(&input, nf, EmojiFilter::DropFe0f)?;
384 out.extend(tokens.into_iter().flat_map(|t| t.cps));
385 }
386 str_from_cps(&out)
387}
388
389pub fn ens_normalize(name: &str) -> Result<String> {
390 flatten(split(name, nfc, EmojiFilter::DropFe0f))
391}
392
393pub fn ens_beautify(name: &str) -> Result<String> {
394 let mut labels = split(name, nfc, EmojiFilter::Preserve);
395 for label in &mut labels {
396 if label.error.is_some() {
397 break;
398 }
399 if label.label_type.as_deref() != Some("Greek")
400 && let Some(output) = &mut label.output
401 {
402 array_replace(output, 0x3BE, 0x39E);
403 }
404 }
405 flatten(labels)
406}
407
408pub fn ens_split(name: &str, preserve_emoji: bool) -> Vec<Label> {
409 split(
410 name,
411 nfc,
412 if preserve_emoji {
413 EmojiFilter::Preserve
414 } else {
415 EmojiFilter::DropFe0f
416 },
417 )
418}
419
420fn split(name: &str, nf: fn(&[u32]) -> Vec<u32>, ef: EmojiFilter) -> Vec<Label> {
421 if name.is_empty() {
422 return Vec::new();
423 }
424
425 let mut offset = 0usize;
426 name.split('.')
427 .map(|label| {
428 let input = explode_cp(label);
429 let mut info = Label {
430 input: input.clone(),
431 offset,
432 error: None,
433 tokens: None,
434 output: None,
435 emoji: None,
436 label_type: None,
437 };
438 offset += input.len() + 1;
439
440 if let Err(err) = process_label(&input, nf, ef, &mut info) {
441 info.error = Some(err);
442 }
443 info
444 })
445 .collect()
446}
447
448fn process_label(
449 input: &[u32],
450 nf: fn(&[u32]) -> Vec<u32>,
451 ef: EmojiFilter,
452 info: &mut Label,
453) -> Result<()> {
454 let tokens = tokens_from_str(input, nf, ef)?;
455 info.tokens = Some(tokens.iter().map(|t| t.cps.clone()).collect());
456 if tokens.is_empty() {
457 return Err(EnsError::new("empty label"));
458 }
459
460 let output: Vec<u32> = tokens.iter().flat_map(|t| t.cps.iter().copied()).collect();
461 info.output = Some(output.clone());
462 check_leading_underscore(&output)?;
463 let emoji = tokens.len() > 1 || tokens[0].is_emoji;
464 info.emoji = Some(emoji);
465 let label_type = if !emoji && output.iter().all(|&cp| cp < 0x80) {
466 check_label_extension(&output)?;
467 "ASCII".to_string()
468 } else {
469 let chars: Vec<u32> = tokens
470 .iter()
471 .filter(|t| !t.is_emoji)
472 .flat_map(|t| t.cps.iter().copied())
473 .collect();
474 if chars.is_empty() {
475 "Emoji".to_string()
476 } else {
477 if ENS.cm.contains(&output[0]) {
478 return Err(error_placement("leading combining mark"));
479 }
480 for i in 1..tokens.len() {
481 if !tokens[i].is_emoji && ENS.cm.contains(&tokens[i].cps[0]) {
482 let prev = str_from_cps(&tokens[i - 1].cps)?;
483 let mark = safe_str_from_cps(&[tokens[i].cps[0]], None);
484 return Err(error_placement(&format!(
485 "emoji + combining mark: \"{prev} + {mark}\""
486 )));
487 }
488 }
489
490 check_fenced(&output)?;
491 let unique = unique_preserving_order(&chars);
492 let group = determine_group(&unique)?;
493 check_group(group, &chars)?;
494 check_whole(group, &unique)?;
495 ENS.groups[group].name.clone()
496 }
497 };
498
499 info.label_type = Some(label_type);
500 Ok(())
501}
502
/// Returns the distinct values of `cps` in order of first appearance.
fn unique_preserving_order(cps: &[u32]) -> Vec<u32> {
    let mut seen = HashSet::new();
    cps.iter()
        .copied()
        // `insert` returns `false` for values that were already recorded.
        .filter(|&cp| seen.insert(cp))
        .collect()
}
513
514fn check_label_extension(cps: &[u32]) -> Result<()> {
515 if cps.len() >= 4 && cps[2] == HYPHEN && cps[3] == HYPHEN {
516 let s = str_from_cps(&cps[..4])?;
517 Err(EnsError::new(format!("invalid label extension: \"{s}\"")))
518 } else {
519 Ok(())
520 }
521}
522
523fn check_leading_underscore(cps: &[u32]) -> Result<()> {
524 const UNDERSCORE: u32 = 0x5F;
525 if let Some(mut i) = cps.iter().rposition(|&cp| cp == UNDERSCORE) {
526 while i > 0 {
527 i -= 1;
528 if cps[i] != UNDERSCORE {
529 return Err(EnsError::new("underscore allowed only at start"));
530 }
531 }
532 }
533 Ok(())
534}
535
536fn check_fenced(cps: &[u32]) -> Result<()> {
537 if cps.is_empty() {
538 return Ok(());
539 }
540 let mut prev = ENS.fenced.get(&cps[0]);
541 if let Some(prev) = prev {
542 return Err(error_placement(&format!("leading {prev}")));
543 }
544
545 let mut last = usize::MAX;
546 for (i, &cp) in cps.iter().enumerate().skip(1) {
547 if let Some(matched) = ENS.fenced.get(&cp) {
548 if last == i {
549 return Err(error_placement(&format!("{} + {matched}", prev.unwrap())));
550 }
551 last = i + 1;
552 prev = Some(matched);
553 }
554 }
555 if last == cps.len()
556 && let Some(prev) = prev
557 {
558 return Err(error_placement(&format!("trailing {prev}")));
559 }
560 Ok(())
561}
562
563fn determine_group(unique: &[u32]) -> Result<usize> {
564 let mut groups: Vec<usize> = (0..ENS.groups.len()).collect();
565 for &cp in unique {
566 let gs: Vec<usize> = groups
567 .iter()
568 .copied()
569 .filter(|&i| ENS.groups[i].has_cp(cp))
570 .collect();
571 if gs.is_empty() {
572 if !ENS.groups.iter().any(|g| g.has_cp(cp)) {
573 return Err(error_disallowed(cp));
574 }
575 return Err(error_group_member(groups[0], cp));
576 }
577 groups = gs;
578 if groups.len() == 1 {
579 break;
580 }
581 }
582 Ok(groups[0])
583}
584
585fn check_group(group: usize, cps: &[u32]) -> Result<()> {
586 let g = &ENS.groups[group];
587 for &cp in cps {
588 if !g.has_cp(cp) {
589 return Err(error_group_member(group, cp));
590 }
591 }
592
593 if g.check_nsm {
594 let decomposed = nfd(cps);
595 let mut i = 1usize;
596 while i < decomposed.len() {
597 if ENS.nsm.contains(&decomposed[i]) {
598 let mut j = i + 1;
599 while j < decomposed.len() && ENS.nsm.contains(&decomposed[j]) {
600 for k in i..j {
601 if decomposed[k] == decomposed[j] {
602 return Err(EnsError::new(format!(
603 "duplicate non-spacing marks: {}",
604 quoted_cp(decomposed[j])
605 )));
606 }
607 }
608 j += 1;
609 }
610 if j - i > ENS.nsm_max {
611 let s = safe_str_from_cps(&decomposed[i - 1..j], None);
612 return Err(EnsError::new(format!(
613 "excessive non-spacing marks: {} ({}/{})",
614 bidi_qq(&s),
615 j - i,
616 ENS.nsm_max
617 )));
618 }
619 i = j;
620 } else {
621 i += 1;
622 }
623 }
624 }
625
626 Ok(())
627}
628
629fn check_whole(group: usize, unique: &[u32]) -> Result<()> {
630 let mut maker: Option<Vec<usize>> = None;
631 let mut shared = Vec::new();
632 for &cp in unique {
633 match ENS.whole_map.get(&cp).copied() {
634 Some(UNIQUE_PH) => return Ok(()),
635 Some(whole_index) => {
636 let set = ENS.wholes[whole_index]
637 .complements
638 .get(&cp)
639 .cloned()
640 .unwrap_or_default();
641 maker = Some(match maker {
642 Some(prev) => prev.into_iter().filter(|g| set.contains(g)).collect(),
643 None => set,
644 });
645 if maker.as_ref().is_some_and(|m| m.is_empty()) {
646 return Ok(());
647 }
648 }
649 None => shared.push(cp),
650 }
651 }
652
653 if let Some(maker) = maker {
654 for other in maker {
655 if shared.iter().all(|&cp| ENS.groups[other].has_cp(cp)) {
656 return Err(EnsError::new(format!(
657 "whole-script confusable: {}/{}",
658 ENS.groups[group].name, ENS.groups[other].name
659 )));
660 }
661 }
662 }
663 Ok(())
664}
665
666fn flatten(labels: Vec<Label>) -> Result<String> {
667 let multiple = labels.len() != 1;
668 let mut out = Vec::new();
669 for label in labels {
670 if let Some(error) = label.error {
671 if multiple {
672 let safe = safe_str_from_cps(&label.input, Some(63));
673 return Err(EnsError::new(format!(
674 "Invalid label {}: {}",
675 bidi_qq(&safe),
676 error.message()
677 )));
678 }
679 return Err(error);
680 }
681 out.push(str_from_cps(label.output.as_deref().unwrap_or_default())?);
682 }
683 Ok(out.join("."))
684}
685
686fn quoted_cp(cp: u32) -> String {
687 let prefix = if should_escape(cp) {
688 String::new()
689 } else {
690 format!("{} ", bidi_qq(&safe_str_from_cps(&[cp], None)))
691 };
692 format!("{prefix}{}", quote_cp(cp))
693}
694
695fn error_disallowed(cp: u32) -> EnsError {
696 EnsError::new(format!("disallowed character: {}", quoted_cp(cp)))
697}
698
699fn error_group_member(group: usize, cp: u32) -> EnsError {
700 let mut quoted = quoted_cp(cp);
701 if let Some(gg) = ENS.groups.iter().find(|g| g.primary.contains(&cp)) {
702 quoted = format!("{} {quoted}", gg.name);
703 }
704 EnsError::new(format!(
705 "illegal mixture: {} + {quoted}",
706 ENS.groups[group].name
707 ))
708}
709
710fn error_placement(where_: &str) -> EnsError {
711 EnsError::new(format!("illegal placement: {where_}"))
712}
713
/// How matched emoji sequences are emitted by `filter_emoji`.
#[derive(Debug, Clone, Copy)]
enum EmojiFilter {
    /// Emit the sequence unchanged.
    Preserve,
    /// Emit the sequence with every U+FE0F removed.
    DropFe0f,
}
719
720fn filter_emoji(cps: &[u32], filter: EmojiFilter) -> Vec<u32> {
721 match filter {
722 EmojiFilter::Preserve => cps.to_vec(),
723 EmojiFilter::DropFe0f => cps.iter().copied().filter(|&cp| cp != FE0F).collect(),
724 }
725}
726
727fn tokens_from_str(
728 input: &[u32],
729 nf: fn(&[u32]) -> Vec<u32>,
730 ef: EmojiFilter,
731) -> Result<Vec<NormToken>> {
732 let mut ret = Vec::new();
733 let mut chars = Vec::new();
734 let mut input = input.to_vec();
735 input.reverse();
736
737 while !input.is_empty() {
738 if let Some(emoji) = consume_emoji_reversed(&mut input, None) {
739 if !chars.is_empty() {
740 ret.push(NormToken {
741 cps: nf(&chars),
742 is_emoji: false,
743 });
744 chars.clear();
745 }
746 ret.push(NormToken {
747 cps: filter_emoji(&emoji, ef),
748 is_emoji: true,
749 });
750 } else {
751 let cp = input.pop().expect("input is not empty");
752 if ENS.valid.contains(&cp) {
753 chars.push(cp);
754 } else if let Some(cps) = ENS.mapped.get(&cp) {
755 chars.extend_from_slice(cps);
756 } else if !ENS.ignored.contains(&cp) {
757 return Err(error_disallowed(cp));
758 }
759 }
760 }
761
762 if !chars.is_empty() {
763 ret.push(NormToken {
764 cps: nf(&chars),
765 is_emoji: false,
766 });
767 }
768
769 Ok(ret)
770}
771
772fn consume_emoji_reversed(input: &mut Vec<u32>, eaten: Option<&mut Vec<u32>>) -> Option<Vec<u32>> {
773 let mut eaten = eaten;
774 let mut node = 0usize;
775 let mut emoji = None;
776 let mut pos = input.len();
777 while pos > 0 {
778 pos -= 1;
779 let cp = input[pos];
780 let Some(&child) = ENS.emoji_root.nodes[node].children.get(&cp) else {
781 break;
782 };
783 node = child;
784 if let Some(value) = ENS.emoji_root.nodes[node].value.clone() {
785 if let Some(eaten) = eaten.as_deref_mut() {
786 eaten.extend(input[pos..].iter().rev().copied());
787 }
788 input.truncate(pos);
789 emoji = Some(value);
790 }
791 }
792 emoji
793}
794
795pub fn ens_tokenize(name: &str) -> Vec<Token> {
796 ens_tokenize_with_options(name, TokenizeOptions::default())
797}
798
799pub fn ens_tokenize_with_options(name: &str, options: TokenizeOptions) -> Vec<Token> {
800 tokenize(name, options.nf)
801}
802
803fn tokenize(name: &str, nf: bool) -> Vec<Token> {
804 let mut input = explode_cp(name);
805 input.reverse();
806 let mut eaten = Vec::new();
807 let mut tokens = Vec::new();
808
809 while !input.is_empty() {
810 if let Some(emoji) = consume_emoji_reversed(&mut input, Some(&mut eaten)) {
811 tokens.push(Token::Emoji {
812 input: std::mem::take(&mut eaten),
813 cps: filter_emoji(&emoji, EmojiFilter::DropFe0f),
814 emoji,
815 });
816 } else {
817 let cp = input.pop().expect("input is not empty");
818 if cp == STOP {
819 tokens.push(Token::Stop { cp });
820 } else if ENS.valid.contains(&cp) {
821 tokens.push(Token::Valid { cps: vec![cp] });
822 } else if ENS.ignored.contains(&cp) {
823 tokens.push(Token::Ignored { cp });
824 } else if let Some(cps) = ENS.mapped.get(&cp) {
825 tokens.push(Token::Mapped {
826 cp,
827 cps: cps.clone(),
828 });
829 } else {
830 tokens.push(Token::Disallowed { cp });
831 }
832 }
833 }
834
835 if nf {
836 apply_token_nfc(&mut tokens);
837 }
838
839 collapse_valid_tokens(tokens)
840}
841
842fn is_valid_or_mapped(token: &Token) -> bool {
843 matches!(token, Token::Valid { .. } | Token::Mapped { .. })
844}
845
846fn valid_or_mapped_cps(token: &Token) -> Option<&[u32]> {
847 match token {
848 Token::Valid { cps } | Token::Mapped { cps, .. } => Some(cps),
849 _ => None,
850 }
851}
852
853fn requires_check(cps: &[u32]) -> bool {
854 cps.iter().any(|cp| ENS.nfc_check.contains(cp))
855}
856
/// Replaces spans of valid/mapped tokens whose NFC form differs from their code points with a
/// single `Token::Nfc`, in place.
///
/// `start` remembers the most recent valid/mapped token that did not itself require a check,
/// so that it is included in the span when the following token does; it is cleared by any
/// token that is neither valid, mapped, nor ignored.
fn apply_token_nfc(tokens: &mut Vec<Token>) {
    let mut i = 0usize;
    let mut start: Option<usize> = None;
    while i < tokens.len() {
        if is_valid_or_mapped(&tokens[i]) {
            let cps = valid_or_mapped_cps(&tokens[i]).unwrap();
            if requires_check(cps) {
                // Extend the span over following valid/mapped tokens that also require a check;
                // ignored tokens are skipped over without extending `end`.
                let mut end = i + 1;
                let mut pos = end;
                while pos < tokens.len() {
                    if let Some(cps) = valid_or_mapped_cps(&tokens[pos]) {
                        if !requires_check(cps) {
                            break;
                        }
                        end = pos + 1;
                    } else if !matches!(tokens[pos], Token::Ignored { .. }) {
                        break;
                    }
                    pos += 1;
                }
                let start_i = start.unwrap_or(i);
                let slice = tokens[start_i..end].to_vec();
                // Only valid/mapped tokens contribute code points; ignored ones are dropped.
                let cps0: Vec<u32> = slice
                    .iter()
                    .filter_map(valid_or_mapped_cps)
                    .flat_map(|cps| cps.iter().copied())
                    .collect();
                let cps = nfc(&cps0);
                if compare_arrays(&cps, &cps0) != 0 {
                    // NFC changed something: bundle the whole span into one `Nfc` token.
                    let text = str_from_cps(&cps).unwrap_or_default();
                    let replacement = Token::Nfc {
                        input: cps0,
                        tokens0: collapse_valid_tokens(slice),
                        cps,
                        tokens: tokenize(&text, false),
                    };
                    tokens.splice(start_i..end, [replacement]);
                    // Continue after the replacement (the loop tail adds one).
                    i = start_i;
                } else {
                    // Already NFC: skip to the last token of the span (the loop tail adds one).
                    i = end.saturating_sub(1);
                }
                start = None;
            } else {
                start = Some(i);
            }
        } else if !matches!(tokens[i], Token::Ignored { .. }) {
            start = None;
        }
        i += 1;
    }
}
908
909fn collapse_valid_tokens(tokens: Vec<Token>) -> Vec<Token> {
910 let mut out = Vec::new();
911 let mut i = 0usize;
912 while i < tokens.len() {
913 if let Token::Valid { .. } = &tokens[i] {
914 let mut cps = Vec::new();
915 while i < tokens.len() {
916 if let Token::Valid { cps: next } = &tokens[i] {
917 cps.extend_from_slice(next);
918 i += 1;
919 } else {
920 break;
921 }
922 }
923 out.push(Token::Valid { cps });
924 } else {
925 out.push(tokens[i].clone());
926 i += 1;
927 }
928 }
929 out
930}