basic_text_internals/
replace.rs

1//! On input, several disallowed scalar values are replaced, so that content
2//! containing them can still be read, but applications don't have to
3//! handle them.
4
5use crate::unicode::{BOM, FF, LS, NEL, ORC, PS, REPL, WJ};
6use std::collections::VecDeque;
7
8#[inline]
9pub fn replace(c: char, queue: &mut VecDeque<char>) {
10    match c {
11        BOM => queue.push_back(WJ),
12        '\u{149}' => {
13            queue.push_back('\u{2bc}');
14            queue.push_back('\u{6e}');
15        }
16        '\u{673}' => {
17            queue.push_back('\u{627}');
18            queue.push_back('\u{65f}');
19        }
20        '\u{f77}' => {
21            queue.push_back('\u{fb2}');
22            queue.push_back('\u{f81}');
23        }
24        '\u{f79}' => {
25            queue.push_back('\u{fb3}');
26            queue.push_back('\u{f81}');
27        }
28        '\u{17a3}' => queue.push_back('\u{17a2}'),
29        '\u{17a4}' => {
30            queue.push_back('\u{17a2}');
31            queue.push_back('\u{17b6}');
32        }
33        // Discouraged characters
34        '\u{2df5}' => {
35            queue.push_back('\u{2ded}');
36            queue.push_back('\u{2dee}');
37        }
38        '\u{111c4}' => {
39            queue.push_back('\u{1118f}');
40            queue.push_back('\u{11180}');
41        }
42        LS | PS => queue.push_back(' '),
43        // Latin Ligatures
44        '\u{fb00}' => {
45            queue.push_back('f');
46            queue.push_back('f');
47        }
48        '\u{fb01}' => {
49            queue.push_back('f');
50            queue.push_back('i');
51        }
52        '\u{fb02}' => {
53            queue.push_back('f');
54            queue.push_back('l');
55        }
56        '\u{fb03}' => {
57            queue.push_back('f');
58            queue.push_back('f');
59            queue.push_back('i');
60        }
61        '\u{fb04}' => {
62            queue.push_back('f');
63            queue.push_back('f');
64            queue.push_back('l');
65        }
66        '\u{fb05}' => {
67            queue.push_back('ſ');
68            queue.push_back('t');
69        }
70        '\u{fb06}' => {
71            queue.push_back('s');
72            queue.push_back('t');
73        }
74        FF | NEL => queue.push_back(' '),
75        // Control codes: C0 (except '\n', '\t', FF, and ESC), DEL, C1 (except NEL)
76        '\u{0}' | '\u{1}' | '\u{2}' | '\u{3}' |
77        '\u{4}' | '\u{5}' | '\u{6}' | '\u{7}' |
78        '\u{8}' | '\u{b}' |
79        '\u{d}' | '\u{e}' | '\u{f}' |
80        '\u{10}' | '\u{11}' | '\u{12}' | '\u{13}' |
81        '\u{14}' | '\u{15}' | '\u{16}' | '\u{17}' |
82        '\u{18}' | '\u{19}' | '\u{1a}' |
83        '\u{1c}' | '\u{1d}' | '\u{1e}' | '\u{1f}' |
84        '\u{7f}' |
85        '\u{80}' | '\u{81}' | '\u{82}' | '\u{83}' |
86        '\u{84}' | '\u{86}' | '\u{87}' |
87        '\u{88}' | '\u{89}' | '\u{8a}' | '\u{8b}' |
88        '\u{8c}' | '\u{8d}' | '\u{8e}' | '\u{8f}' |
89        '\u{90}' | '\u{91}' | '\u{92}' | '\u{93}' |
90        '\u{94}' | '\u{95}' | '\u{96}' | '\u{97}' |
91        '\u{98}' | '\u{99}' | '\u{9a}' | '\u{9b}' |
92        '\u{9c}' | '\u{9d}' | '\u{9e}' | '\u{9f}' |
93        // Angle brackets
94        '\u{2329}' | '\u{232a}' |
95        // Interlinear Annotations
96        '\u{fff9}'..='\u{fffb}' |
97        // Unassigned characters with replacements.
98        '\u{9e4}' | '\u{9e5}' | '\u{a64}' | '\u{a65}' |
99        '\u{ae4}' | '\u{ae5}' | '\u{b64}' | '\u{b65}' |
100        '\u{be4}' | '\u{be5}' | '\u{c64}' | '\u{c65}' |
101        '\u{ce4}' | '\u{ce5}' | '\u{d64}' | '\u{d65}' |
102        '\u{2072}' | '\u{2073}' |
103        '\u{1d455}' | '\u{1d49d}' | '\u{1d4a0}' | '\u{1d4a1}' |
104        '\u{1d4a3}' | '\u{1d4a4}' | '\u{1d4a7}' | '\u{1d4a8}' |
105        '\u{1d4ad}' | '\u{1d4ba}' | '\u{1d4bc}' | '\u{1d4c4}' |
106        '\u{1d506}' | '\u{1d50b}' | '\u{1d50c}' | '\u{1d515}' |
107        '\u{1d51d}' | '\u{1d53a}' | '\u{1d53f}' | '\u{1d545}' |
108        '\u{1d547}' | '\u{1d548}' | '\u{1d549}' | '\u{1d551}' |
109        // Object Replacement Character
110        ORC |
111        // Khmer characters erroneously invented by Unicode.
112        '\u{17b4}' | '\u{17b5}' | '\u{17d8}' |
113        // Deprecated Format Characters
114        '\u{206a}'..='\u{206f}' |
115        // Bidirectional Format Characters
116        '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' |
117        '\u{2066}' | '\u{2067}' | '\u{2068}' | '\u{2069}' |
118        // Language Tag
119        '\u{e0001}' |
120        // Noncharacters
121        '\u{fffe}' ..= '\u{ffff}' |
122        '\u{1fffe}' ..= '\u{1ffff}' |
123        '\u{2fffe}' ..= '\u{2ffff}' |
124        '\u{3fffe}' ..= '\u{3ffff}' |
125        '\u{4fffe}' ..= '\u{4ffff}' |
126        '\u{5fffe}' ..= '\u{5ffff}' |
127        '\u{6fffe}' ..= '\u{6ffff}' |
128        '\u{7fffe}' ..= '\u{7ffff}' |
129        '\u{8fffe}' ..= '\u{8ffff}' |
130        '\u{9fffe}' ..= '\u{9ffff}' |
131        '\u{afffe}' ..= '\u{affff}' |
132        '\u{bfffe}' ..= '\u{bffff}' |
133        '\u{cfffe}' ..= '\u{cffff}' |
134        '\u{dfffe}' ..= '\u{dffff}' |
135        '\u{efffe}' ..= '\u{effff}' |
136        '\u{ffffe}' ..= '\u{fffff}' |
137        '\u{10fffe}' ..= '\u{10ffff}' |
138        '\u{fdd0}'..='\u{fdef}' => queue.push_back(REPL),
139
140        c => queue.push_back(c),
141    }
142}