ens_normalize_rs/
validate.rs

1use crate::{
2    constants, static_data::spec_json, utils, CodePoint, CodePointsSpecs, CollapsedEnsNameToken,
3    CurrableError, DisallowedSequence, EnsNameToken, ParsedGroup, ParsedWholeValue, ProcessError,
4    TokenizedLabel, TokenizedName,
5};
6use itertools::Itertools;
7use std::collections::HashSet;
/// Classification assigned to a validated label; an alias for [`spec_json::GroupName`].
pub type LabelType = spec_json::GroupName;
9
10/// Represents a validated ENS label as result of the `validate_label` function.
11/// Contains the original tokenized label and the type of the label.
12#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct ValidatedLabel {
14    pub tokens: Vec<EnsNameToken>,
15    pub label_type: LabelType,
16}
17
18pub fn validate_name(
19    name: &TokenizedName,
20    specs: &CodePointsSpecs,
21) -> Result<Vec<ValidatedLabel>, ProcessError> {
22    if name.is_empty() {
23        return Ok(vec![]);
24    }
25    let labels = name
26        .iter_labels()
27        .map(|label| validate_label(label, specs))
28        .collect::<Result<Vec<_>, _>>()?;
29    Ok(labels)
30}
31
32/// Validates a tokenized ENS label according to the ENSIP 15 specification
33/// https://docs.ens.domains/ensip/15#validate
34pub fn validate_label(
35    label: TokenizedLabel<'_>,
36    specs: &CodePointsSpecs,
37) -> Result<ValidatedLabel, ProcessError> {
38    non_empty(&label)?;
39    check_token_types(&label)?;
40    if label.is_fully_emoji() {
41        return Ok(ValidatedLabel {
42            tokens: label.tokens.to_owned(),
43            label_type: LabelType::Emoji,
44        });
45    };
46    underscore_only_at_beginning(&label)?;
47    if label.is_fully_ascii() {
48        no_hyphen_at_second_and_third(&label)?;
49        return Ok(ValidatedLabel {
50            tokens: label.tokens.to_owned(),
51            label_type: LabelType::Ascii,
52        });
53    }
54    check_fenced(&label, specs)?;
55    check_cm_leading_emoji(&label, specs)?;
56    let group = check_and_get_group(&label, specs)?;
57    Ok(ValidatedLabel {
58        tokens: label.tokens.to_owned(),
59        label_type: group.name,
60    })
61}
62
63fn non_empty(label: &TokenizedLabel) -> Result<(), ProcessError> {
64    let non_ignored_token_exists = label.tokens.iter().any(|token| !token.is_ignored());
65    if !non_ignored_token_exists {
66        return Err(ProcessError::DisallowedSequence(
67            DisallowedSequence::EmptyLabel,
68        ));
69    }
70    Ok(())
71}
72
73fn check_token_types(label: &TokenizedLabel) -> Result<(), ProcessError> {
74    if let Some(token) = label
75        .tokens
76        .iter()
77        .find(|token| token.is_disallowed() || token.is_stop())
78    {
79        let cps = token.cps();
80        let maybe_invisible_cp = cps.iter().find(|cp| {
81            *cp == &constants::CP_ZERO_WIDTH_JOINER || *cp == &constants::CP_ZERO_WIDTH_NON_JOINER
82        });
83        if let Some(invisible_cp) = maybe_invisible_cp {
84            return Err(ProcessError::DisallowedSequence(
85                DisallowedSequence::InvisibleCharacter(*invisible_cp),
86            ));
87        } else {
88            return Err(ProcessError::DisallowedSequence(
89                DisallowedSequence::Invalid(utils::cps2str(&cps)),
90            ));
91        }
92    }
93    Ok(())
94}
95
96fn underscore_only_at_beginning(label: &TokenizedLabel) -> Result<(), ProcessError> {
97    let leading_underscores = label
98        .iter_cps()
99        .take_while(|cp| *cp == constants::CP_UNDERSCORE)
100        .count();
101    let underscore_in_middle = label
102        .iter_cps()
103        .enumerate()
104        .skip(leading_underscores)
105        .find(|(_, cp)| *cp == constants::CP_UNDERSCORE);
106    if let Some((index, _)) = underscore_in_middle {
107        return Err(ProcessError::CurrableError {
108            inner: CurrableError::UnderscoreInMiddle,
109            index,
110            sequence: utils::cps2str(&[constants::CP_UNDERSCORE]),
111            maybe_suggest: Some("".to_string()),
112        });
113    }
114    Ok(())
115}
116
117// The 3rd and 4th characters must not both be 2D (-) HYPHEN-MINUS.
118// Must not match /^..--/
119// Examples: "ab-c" and "---a"are valid, "xn--" and ---- are invalid.
120fn no_hyphen_at_second_and_third(label: &TokenizedLabel) -> Result<(), ProcessError> {
121    if label.iter_cps().nth(2) == Some(constants::CP_HYPHEN)
122        && label.iter_cps().nth(3) == Some(constants::CP_HYPHEN)
123    {
124        return Err(ProcessError::CurrableError {
125            inner: CurrableError::HyphenAtSecondAndThird,
126            index: 2,
127            sequence: utils::cps2str(&[constants::CP_HYPHEN, constants::CP_HYPHEN]),
128            maybe_suggest: Some("".to_string()),
129        });
130    }
131    Ok(())
132}
133
134fn check_fenced(label: &TokenizedLabel, specs: &CodePointsSpecs) -> Result<(), ProcessError> {
135    if let Some(first_cp) = label.iter_cps().next() {
136        if specs.is_fenced(first_cp) {
137            return Err(ProcessError::CurrableError {
138                inner: CurrableError::FencedLeading,
139                index: 0,
140                sequence: utils::cps2str(&[first_cp]),
141                maybe_suggest: Some("".to_string()),
142            });
143        }
144    }
145    if let Some(last_cp) = label.iter_cps().last() {
146        if specs.is_fenced(last_cp) {
147            return Err(ProcessError::CurrableError {
148                inner: CurrableError::FencedTrailing,
149                index: label.iter_cps().count() - 1,
150                sequence: utils::cps2str(&[last_cp]),
151                maybe_suggest: Some("".to_string()),
152            });
153        }
154    }
155
156    for (i, window) in label.iter_cps().tuple_windows().enumerate() {
157        let (one, two) = window;
158        if specs.is_fenced(one) && specs.is_fenced(two) {
159            return Err(ProcessError::CurrableError {
160                inner: CurrableError::FencedConsecutive,
161                index: i,
162                sequence: utils::cps2str(&[one, two]),
163                maybe_suggest: Some(utils::cp2str(one)),
164            });
165        }
166    }
167    Ok(())
168}
169
170fn check_cm_leading_emoji(
171    label: &TokenizedLabel,
172    specs: &CodePointsSpecs,
173) -> Result<(), ProcessError> {
174    let mut index = 0;
175    let collapsed = label.collapse_into_text_or_emoji();
176    for (i, token) in collapsed.iter().enumerate() {
177        if let CollapsedEnsNameToken::Text(token) = token {
178            if let Some(cp) = token.cps.first() {
179                if specs.is_cm(*cp) {
180                    if i == 0 {
181                        return Err(ProcessError::CurrableError {
182                            inner: CurrableError::CmStart,
183                            index,
184                            sequence: utils::cps2str(&[*cp]),
185                            maybe_suggest: Some("".to_string()),
186                        });
187                    } else {
188                        return Err(ProcessError::CurrableError {
189                            inner: CurrableError::CmAfterEmoji,
190                            index,
191                            sequence: utils::cps2str(&[*cp]),
192                            maybe_suggest: Some("".to_string()),
193                        });
194                    }
195                }
196            }
197        }
198        index += token.input_size();
199    }
200
201    Ok(())
202}
203
204fn check_and_get_group(
205    label: &TokenizedLabel,
206    specs: &CodePointsSpecs,
207) -> Result<ParsedGroup, ProcessError> {
208    let cps = label.get_cps_of_not_ignored_text();
209    let unique_cps = cps
210        .clone()
211        .into_iter()
212        .collect::<HashSet<_>>()
213        .into_iter()
214        .collect::<Vec<_>>();
215    let group = determine_group(&unique_cps, specs).cloned()?;
216    check_group(&group, &cps, specs)?;
217    check_whole(&group, &unique_cps, specs)?;
218    Ok(group)
219}
220
221fn check_group(
222    group: &ParsedGroup,
223    cps: &[CodePoint],
224    specs: &CodePointsSpecs,
225) -> Result<(), ProcessError> {
226    for cp in cps.iter() {
227        if !group.contains_cp(*cp) {
228            return Err(ProcessError::Confused(format!(
229                "symbol {} not present in group {}",
230                utils::cp2str(*cp),
231                group.name
232            )));
233        }
234    }
235    if group.cm_absent {
236        let decomposed = utils::nfd_cps(cps, specs);
237        let mut i = 1;
238        let e = decomposed.len();
239        while i < e {
240            if specs.is_nsm(decomposed[i]) {
241                let mut j = i + 1;
242                while j < e && specs.is_nsm(decomposed[j]) {
243                    if j - i + 1 > specs.nsm_max() as usize {
244                        return Err(ProcessError::DisallowedSequence(
245                            DisallowedSequence::NsmTooMany,
246                        ));
247                    }
248                    for k in i..j {
249                        if decomposed[k] == decomposed[j] {
250                            return Err(ProcessError::DisallowedSequence(
251                                DisallowedSequence::NsmRepeated,
252                            ));
253                        }
254                    }
255                    j += 1;
256                }
257                i = j;
258            }
259            i += 1;
260        }
261    }
262    Ok(())
263}
264
265fn check_whole(
266    group: &ParsedGroup,
267    unique_cps: &[CodePoint],
268    specs: &CodePointsSpecs,
269) -> Result<(), ProcessError> {
270    let (maker, shared) = get_groups_candidates_and_shared_cps(unique_cps, specs);
271    for group_name in maker {
272        let confused_group_candidate = specs.group_by_name(group_name).expect("group must exist");
273        if confused_group_candidate.contains_all_cps(&shared) {
274            return Err(ProcessError::ConfusedGroups {
275                group1: group.name.to_string(),
276                group2: confused_group_candidate.name.to_string(),
277            });
278        }
279    }
280    Ok(())
281}
282
283fn get_groups_candidates_and_shared_cps(
284    unique_cps: &[CodePoint],
285    specs: &CodePointsSpecs,
286) -> (Vec<String>, Vec<CodePoint>) {
287    let mut maybe_groups: Option<Vec<String>> = None;
288    let mut shared: Vec<CodePoint> = Vec::new();
289
290    for cp in unique_cps {
291        match specs.whole_map(*cp) {
292            Some(ParsedWholeValue::Number(_)) => {
293                return (vec![], vec![]);
294            }
295            Some(ParsedWholeValue::WholeObject(whole)) => {
296                let confused_groups_names = whole
297                    .m
298                    .get(cp)
299                    .expect("since we got `whole` from cp, `M` must have a value for `cp`");
300
301                match maybe_groups.as_mut() {
302                    Some(groups) => {
303                        groups.retain(|g| confused_groups_names.contains(g));
304                    }
305                    None => {
306                        maybe_groups = Some(confused_groups_names.iter().cloned().collect());
307                    }
308                }
309            }
310            None => {
311                shared.push(*cp);
312            }
313        };
314    }
315
316    (maybe_groups.unwrap_or_default(), shared)
317}
318
319fn determine_group<'a>(
320    unique_cps: &'a [CodePoint],
321    specs: &'a CodePointsSpecs,
322) -> Result<&'a ParsedGroup, ProcessError> {
323    specs
324        .groups_for_cps(unique_cps)
325        .next()
326        .ok_or(ProcessError::Confused(format!(
327            "no group found for {:?}",
328            unique_cps
329        )))
330}
331
// Unit tests for label validation and for `LabelType` deserialization.
#[cfg(test)]
mod tests {
    use crate::TokenizedName;

    use super::*;
    use pretty_assertions::assert_eq;
    use rstest::{fixture, rstest};

    // Default code point specs, handed to the tests below by reference.
    #[fixture]
    #[once]
    fn specs() -> CodePointsSpecs {
        CodePointsSpecs::default()
    }

    // Tokenizes each input, validates its first label, and compares the resulting label type
    // (or the error) against the expected value.
    #[rstest]
    // success
    #[case::hello("hello", Ok(LabelType::Ascii))]
    #[case::latin("E︎̃", Ok(LabelType::Other("Latin".to_string())))]
    #[case::cyrillic("всем-привет", Ok(LabelType::Other("Cyrillic".to_string())))]
    #[case::with_fenced_in_middle("a・a’s", Ok(LabelType::Other("Han".to_string())))]
    #[case::ascii_with_hyphen("ab-c", Ok(LabelType::Ascii))]
    // errors
    #[case::hyphen_at_second_and_third("ab--", Err(ProcessError::CurrableError {
        inner: CurrableError::HyphenAtSecondAndThird,
        index: 2,
        sequence: "--".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    #[case::fenced_leading("’85", Err(ProcessError::CurrableError {
        inner: CurrableError::FencedLeading,
        index: 0,
        sequence: "’".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    #[case::fenced_contiguous("a・・a", Err(ProcessError::CurrableError {
        inner: CurrableError::FencedConsecutive,
        index: 1,
        sequence: "・・".to_string(),
        maybe_suggest: Some("・".to_string())
    }))]
    #[case::cm_after_emoji("😎😎😎😎😎😎😎😎\u{300}hello", Err(ProcessError::CurrableError {
        inner: CurrableError::CmAfterEmoji,
        index: 8,
        sequence: "\u{300}".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    #[case::cm_leading("\u{300}hello", Err(ProcessError::CurrableError {
        inner: CurrableError::CmStart,
        index: 0,
        sequence: "\u{300}".to_string(),
        maybe_suggest: Some("".to_string())
    }))]
    fn test_validate_and_get_type(
        #[case] input: &str,
        #[case] expected: Result<LabelType, ProcessError>,
        specs: &CodePointsSpecs,
    ) {
        let name = TokenizedName::from_input(input, specs, true).unwrap();
        // Only the first label of the tokenized input is validated.
        let label = name.iter_labels().next().unwrap();
        let result = validate_label(label, specs);
        // Compare label types only; the full result is printed when the assertion fails.
        assert_eq!(
            result.clone().map(|v| v.label_type),
            expected,
            "{:?}",
            result
        );
    }

    // Checks that JSON strings deserialize into the expected `LabelType` values, with an
    // unrecognized name ("FooBar") mapping to `LabelType::Other`.
    #[rstest]
    #[case::emoji("\"Emoji\"", LabelType::Emoji)]
    #[case::ascii("\"ASCII\"", LabelType::Ascii)]
    #[case::greek("\"Greek\"", LabelType::Greek)]
    #[case::other("\"FooBar\"", LabelType::Other("FooBar".to_string()))]
    fn test_deserialize_label_type(#[case] input: &str, #[case] expected: LabelType) {
        let result: LabelType = serde_json::from_str(input).unwrap();
        assert_eq!(result, expected);
    }
}