1use crate::{
2 constants, static_data::spec_json, utils, CodePoint, CodePointsSpecs, CollapsedEnsNameToken,
3 CurrableError, DisallowedSequence, EnsNameToken, ParsedGroup, ParsedWholeValue, ProcessError,
4 TokenizedLabel, TokenizedName,
5};
6use itertools::Itertools;
7use std::collections::HashSet;
/// Classification attached to a validated label.
///
/// This is an alias for the spec's group-name type, so a label is tagged
/// either with one of the dedicated variants (for example `Emoji` or `Ascii`)
/// or with the name of the group its code points belong to.
pub type LabelType = spec_json::GroupName;
9
/// A label that passed every check in [`validate_label`].
///
/// Holds an owned copy of the label's tokens together with the type that
/// validation assigned to it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ValidatedLabel {
    /// Tokens copied from the tokenized label that was validated.
    pub tokens: Vec<EnsNameToken>,
    /// `Emoji`, `Ascii`, or the name of the group detected for the label.
    pub label_type: LabelType,
}
17
18pub fn validate_name(
19 name: &TokenizedName,
20 specs: &CodePointsSpecs,
21) -> Result<Vec<ValidatedLabel>, ProcessError> {
22 if name.is_empty() {
23 return Ok(vec![]);
24 }
25 let labels = name
26 .iter_labels()
27 .map(|label| validate_label(label, specs))
28 .collect::<Result<Vec<_>, _>>()?;
29 Ok(labels)
30}
31
32pub fn validate_label(
35 label: TokenizedLabel<'_>,
36 specs: &CodePointsSpecs,
37) -> Result<ValidatedLabel, ProcessError> {
38 non_empty(&label)?;
39 check_token_types(&label)?;
40 if label.is_fully_emoji() {
41 return Ok(ValidatedLabel {
42 tokens: label.tokens.to_owned(),
43 label_type: LabelType::Emoji,
44 });
45 };
46 underscore_only_at_beginning(&label)?;
47 if label.is_fully_ascii() {
48 no_hyphen_at_second_and_third(&label)?;
49 return Ok(ValidatedLabel {
50 tokens: label.tokens.to_owned(),
51 label_type: LabelType::Ascii,
52 });
53 }
54 check_fenced(&label, specs)?;
55 check_cm_leading_emoji(&label, specs)?;
56 let group = check_and_get_group(&label, specs)?;
57 Ok(ValidatedLabel {
58 tokens: label.tokens.to_owned(),
59 label_type: group.name,
60 })
61}
62
63fn non_empty(label: &TokenizedLabel) -> Result<(), ProcessError> {
64 let non_ignored_token_exists = label.tokens.iter().any(|token| !token.is_ignored());
65 if !non_ignored_token_exists {
66 return Err(ProcessError::DisallowedSequence(
67 DisallowedSequence::EmptyLabel,
68 ));
69 }
70 Ok(())
71}
72
73fn check_token_types(label: &TokenizedLabel) -> Result<(), ProcessError> {
74 if let Some(token) = label
75 .tokens
76 .iter()
77 .find(|token| token.is_disallowed() || token.is_stop())
78 {
79 let cps = token.cps();
80 let maybe_invisible_cp = cps.iter().find(|cp| {
81 *cp == &constants::CP_ZERO_WIDTH_JOINER || *cp == &constants::CP_ZERO_WIDTH_NON_JOINER
82 });
83 if let Some(invisible_cp) = maybe_invisible_cp {
84 return Err(ProcessError::DisallowedSequence(
85 DisallowedSequence::InvisibleCharacter(*invisible_cp),
86 ));
87 } else {
88 return Err(ProcessError::DisallowedSequence(
89 DisallowedSequence::Invalid(utils::cps2str(&cps)),
90 ));
91 }
92 }
93 Ok(())
94}
95
96fn underscore_only_at_beginning(label: &TokenizedLabel) -> Result<(), ProcessError> {
97 let leading_underscores = label
98 .iter_cps()
99 .take_while(|cp| *cp == constants::CP_UNDERSCORE)
100 .count();
101 let underscore_in_middle = label
102 .iter_cps()
103 .enumerate()
104 .skip(leading_underscores)
105 .find(|(_, cp)| *cp == constants::CP_UNDERSCORE);
106 if let Some((index, _)) = underscore_in_middle {
107 return Err(ProcessError::CurrableError {
108 inner: CurrableError::UnderscoreInMiddle,
109 index,
110 sequence: utils::cps2str(&[constants::CP_UNDERSCORE]),
111 maybe_suggest: Some("".to_string()),
112 });
113 }
114 Ok(())
115}
116
117fn no_hyphen_at_second_and_third(label: &TokenizedLabel) -> Result<(), ProcessError> {
121 if label.iter_cps().nth(2) == Some(constants::CP_HYPHEN)
122 && label.iter_cps().nth(3) == Some(constants::CP_HYPHEN)
123 {
124 return Err(ProcessError::CurrableError {
125 inner: CurrableError::HyphenAtSecondAndThird,
126 index: 2,
127 sequence: utils::cps2str(&[constants::CP_HYPHEN, constants::CP_HYPHEN]),
128 maybe_suggest: Some("".to_string()),
129 });
130 }
131 Ok(())
132}
133
134fn check_fenced(label: &TokenizedLabel, specs: &CodePointsSpecs) -> Result<(), ProcessError> {
135 if let Some(first_cp) = label.iter_cps().next() {
136 if specs.is_fenced(first_cp) {
137 return Err(ProcessError::CurrableError {
138 inner: CurrableError::FencedLeading,
139 index: 0,
140 sequence: utils::cps2str(&[first_cp]),
141 maybe_suggest: Some("".to_string()),
142 });
143 }
144 }
145 if let Some(last_cp) = label.iter_cps().last() {
146 if specs.is_fenced(last_cp) {
147 return Err(ProcessError::CurrableError {
148 inner: CurrableError::FencedTrailing,
149 index: label.iter_cps().count() - 1,
150 sequence: utils::cps2str(&[last_cp]),
151 maybe_suggest: Some("".to_string()),
152 });
153 }
154 }
155
156 for (i, window) in label.iter_cps().tuple_windows().enumerate() {
157 let (one, two) = window;
158 if specs.is_fenced(one) && specs.is_fenced(two) {
159 return Err(ProcessError::CurrableError {
160 inner: CurrableError::FencedConsecutive,
161 index: i,
162 sequence: utils::cps2str(&[one, two]),
163 maybe_suggest: Some(utils::cp2str(one)),
164 });
165 }
166 }
167 Ok(())
168}
169
170fn check_cm_leading_emoji(
171 label: &TokenizedLabel,
172 specs: &CodePointsSpecs,
173) -> Result<(), ProcessError> {
174 let mut index = 0;
175 let collapsed = label.collapse_into_text_or_emoji();
176 for (i, token) in collapsed.iter().enumerate() {
177 if let CollapsedEnsNameToken::Text(token) = token {
178 if let Some(cp) = token.cps.first() {
179 if specs.is_cm(*cp) {
180 if i == 0 {
181 return Err(ProcessError::CurrableError {
182 inner: CurrableError::CmStart,
183 index,
184 sequence: utils::cps2str(&[*cp]),
185 maybe_suggest: Some("".to_string()),
186 });
187 } else {
188 return Err(ProcessError::CurrableError {
189 inner: CurrableError::CmAfterEmoji,
190 index,
191 sequence: utils::cps2str(&[*cp]),
192 maybe_suggest: Some("".to_string()),
193 });
194 }
195 }
196 }
197 }
198 index += token.input_size();
199 }
200
201 Ok(())
202}
203
204fn check_and_get_group(
205 label: &TokenizedLabel,
206 specs: &CodePointsSpecs,
207) -> Result<ParsedGroup, ProcessError> {
208 let cps = label.get_cps_of_not_ignored_text();
209 let unique_cps = cps
210 .clone()
211 .into_iter()
212 .collect::<HashSet<_>>()
213 .into_iter()
214 .collect::<Vec<_>>();
215 let group = determine_group(&unique_cps, specs).cloned()?;
216 check_group(&group, &cps, specs)?;
217 check_whole(&group, &unique_cps, specs)?;
218 Ok(group)
219}
220
221fn check_group(
222 group: &ParsedGroup,
223 cps: &[CodePoint],
224 specs: &CodePointsSpecs,
225) -> Result<(), ProcessError> {
226 for cp in cps.iter() {
227 if !group.contains_cp(*cp) {
228 return Err(ProcessError::Confused(format!(
229 "symbol {} not present in group {}",
230 utils::cp2str(*cp),
231 group.name
232 )));
233 }
234 }
235 if group.cm_absent {
236 let decomposed = utils::nfd_cps(cps, specs);
237 let mut i = 1;
238 let e = decomposed.len();
239 while i < e {
240 if specs.is_nsm(decomposed[i]) {
241 let mut j = i + 1;
242 while j < e && specs.is_nsm(decomposed[j]) {
243 if j - i + 1 > specs.nsm_max() as usize {
244 return Err(ProcessError::DisallowedSequence(
245 DisallowedSequence::NsmTooMany,
246 ));
247 }
248 for k in i..j {
249 if decomposed[k] == decomposed[j] {
250 return Err(ProcessError::DisallowedSequence(
251 DisallowedSequence::NsmRepeated,
252 ));
253 }
254 }
255 j += 1;
256 }
257 i = j;
258 }
259 i += 1;
260 }
261 }
262 Ok(())
263}
264
265fn check_whole(
266 group: &ParsedGroup,
267 unique_cps: &[CodePoint],
268 specs: &CodePointsSpecs,
269) -> Result<(), ProcessError> {
270 let (maker, shared) = get_groups_candidates_and_shared_cps(unique_cps, specs);
271 for group_name in maker {
272 let confused_group_candidate = specs.group_by_name(group_name).expect("group must exist");
273 if confused_group_candidate.contains_all_cps(&shared) {
274 return Err(ProcessError::ConfusedGroups {
275 group1: group.name.to_string(),
276 group2: confused_group_candidate.name.to_string(),
277 });
278 }
279 }
280 Ok(())
281}
282
283fn get_groups_candidates_and_shared_cps(
284 unique_cps: &[CodePoint],
285 specs: &CodePointsSpecs,
286) -> (Vec<String>, Vec<CodePoint>) {
287 let mut maybe_groups: Option<Vec<String>> = None;
288 let mut shared: Vec<CodePoint> = Vec::new();
289
290 for cp in unique_cps {
291 match specs.whole_map(*cp) {
292 Some(ParsedWholeValue::Number(_)) => {
293 return (vec![], vec![]);
294 }
295 Some(ParsedWholeValue::WholeObject(whole)) => {
296 let confused_groups_names = whole
297 .m
298 .get(cp)
299 .expect("since we got `whole` from cp, `M` must have a value for `cp`");
300
301 match maybe_groups.as_mut() {
302 Some(groups) => {
303 groups.retain(|g| confused_groups_names.contains(g));
304 }
305 None => {
306 maybe_groups = Some(confused_groups_names.iter().cloned().collect());
307 }
308 }
309 }
310 None => {
311 shared.push(*cp);
312 }
313 };
314 }
315
316 (maybe_groups.unwrap_or_default(), shared)
317}
318
319fn determine_group<'a>(
320 unique_cps: &'a [CodePoint],
321 specs: &'a CodePointsSpecs,
322) -> Result<&'a ParsedGroup, ProcessError> {
323 specs
324 .groups_for_cps(unique_cps)
325 .next()
326 .ok_or(ProcessError::Confused(format!(
327 "no group found for {:?}",
328 unique_cps
329 )))
330}
331
// Unit tests for label validation and for `LabelType` deserialization.
#[cfg(test)]
mod tests {
    use crate::TokenizedName;

    use super::*;
    use pretty_assertions::assert_eq;
    use rstest::{fixture, rstest};

    // Default code point specs, injected by reference into the tests below.
    #[fixture]
    #[once]
    fn specs() -> CodePointsSpecs {
        CodePointsSpecs::default()
    }

    // Each case is an input label and either the expected label type or the
    // exact error expected from `validate_label`.
    #[rstest]
    #[case::hello("hello", Ok(LabelType::Ascii))]
    #[case::latin("E︎̃", Ok(LabelType::Other("Latin".to_string())))]
    #[case::cyrillic("всем-привет", Ok(LabelType::Other("Cyrillic".to_string())))]
    #[case::with_fenced_in_middle("a・a’s", Ok(LabelType::Other("Han".to_string())))]
    #[case::ascii_with_hyphen("ab-c", Ok(LabelType::Ascii))]
    #[case::hyphen_at_second_and_third("ab--", Err(ProcessError::CurrableError {
        inner: CurrableError::HyphenAtSecondAndThird,
        index: 2,
        sequence: "--".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    #[case::fenced_leading("’85", Err(ProcessError::CurrableError {
        inner: CurrableError::FencedLeading,
        index: 0,
        sequence: "’".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    #[case::fenced_contiguous("a・・a", Err(ProcessError::CurrableError {
        inner: CurrableError::FencedConsecutive,
        index: 1,
        sequence: "・・".to_string(),
        maybe_suggest: Some("・".to_string())
    }))]
    #[case::cm_after_emoji("😎😎😎😎😎😎😎😎\u{300}hello", Err(ProcessError::CurrableError {
        inner: CurrableError::CmAfterEmoji,
        index: 8,
        sequence: "\u{300}".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    #[case::cm_leading("\u{300}hello", Err(ProcessError::CurrableError {
        inner: CurrableError::CmStart,
        index: 0,
        sequence: "\u{300}".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    fn test_validate_and_get_type(
        #[case] input: &str,
        #[case] expected: Result<LabelType, ProcessError>,
        specs: &CodePointsSpecs,
    ) {
        let name = TokenizedName::from_input(input, specs, true).unwrap();
        // Only the first label of the tokenized input is validated.
        let label = name.iter_labels().next().unwrap();
        let result = validate_label(label, specs);
        // Compare label types only; the full result is printed on failure.
        assert_eq!(
            result.clone().map(|v| v.label_type),
            expected,
            "{:?}",
            result
        );
    }

    // `LabelType` deserializes from a JSON string; the cases show that a
    // string other than the three named ones becomes `Other`.
    #[rstest]
    #[case::emoji("\"Emoji\"", LabelType::Emoji)]
    #[case::ascii("\"ASCII\"", LabelType::Ascii)]
    #[case::greek("\"Greek\"", LabelType::Greek)]
    #[case::other("\"FooBar\"", LabelType::Other("FooBar".to_string()))]
    fn test_deserialize_label_type(#[case] input: &str, #[case] expected: LabelType) {
        let result: LabelType = serde_json::from_str(input).unwrap();
        assert_eq!(result, expected);
    }
}