// i-dunno 0.6.0
//
// RFC 8771 Internationalized Deliberately Unreadable Network Notation
// Documentation
use crate::data;
use crate::i_dunno_unicode::{
    directionality, has_confusables, has_emoji, script, symbol, unprintable,
};

/// How confusing an I-DUNNO is, using the levels named in RFC 8771.
///
/// Variants are declared from least to most confusing, so the derived
/// ordering yields `Minimum < Satisfactory < Delightful`. The type is a
/// plain field-less enum, so it also derives `Eq`, `Ord` and `Hash`, which
/// lets levels be sorted, compared with `max`/`min`, and used as map or set
/// keys.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ConfusionLevel {
    /// Contains a multi-octet UTF-8 sequence and a character that is
    /// DISALLOWED in IDNA2008.
    Minimum,
    /// Meets `Minimum` and at least two of: a non-printable character,
    /// characters from two different scripts, a "Symbol" category character.
    Satisfactory,
    /// Meets `Satisfactory` and at least two of: mixed directionalities,
    /// an emoji, a "Confusables" character.
    Delightful,
}

/// Classify how confusing `input` is, using the levels from RFC 8771.
///
/// The checks are cumulative: the Satisfactory check only runs once the
/// Minimum check has passed, and the Delightful check only runs once the
/// Satisfactory check has passed.
///
/// Returns `None` when `input` does not even reach the Minimum level.
pub fn confusion_level(input: &str) -> Option<ConfusionLevel> {
    // RFC: Unallocated code points MUST be avoided.
    // NOTE(review): nothing in this function checks for unallocated code
    // points; a `&str` only guarantees well-formed UTF-8 — confirm whether
    // this requirement is enforced elsewhere or intentionally skipped.

    if !is_minimum(input) {
        return None;
    }

    let level = if !is_satisfactory(input) {
        ConfusionLevel::Minimum
    } else if is_delightful(input) {
        ConfusionLevel::Delightful
    } else {
        ConfusionLevel::Satisfactory
    };
    Some(level)
}

/// Reports whether `input` satisfies both requirements RFC 8771 places on
/// every valid I-DUNNO ("As a minimum, a valid I-DUNNO MUST ...").
fn is_minimum(input: &str) -> bool {
    // Later: profile and check whether running the checks in the other
    // order is faster.

    // RFC: contain at least one UTF-8 octet sequence with a length greater
    // than one octet.
    if !has_multi_octet_code_point(input) {
        return false;
    }

    // RFC: contain at least one character that is DISALLOWED in IDNA2008.
    has_idna2008_disallowed_code_point(input)
}

/// Reports whether `input` meets at least two of the three extra criteria
/// RFC 8771 lists for the Satisfactory Confusion Level.
///
/// The Minimum requirements are not re-checked here; the caller is expected
/// to have verified them already.
fn is_satisfactory(input: &str) -> bool {
    // Later: profile and check whether a different order is faster.

    // RFC: at least one non-printable character.
    let unprintable_found = has_unprintable_code_point(input);
    // RFC: characters from at least two different Scripts.
    let mixed_scripts = has_multiple_scripts(input);

    match (unprintable_found, mixed_scripts) {
        // Two criteria already met: the symbol check cannot change the answer.
        (true, true) => true,
        // None met: a single remaining criterion cannot reach two.
        (false, false) => false,
        // Exactly one met, so the third criterion decides.
        // RFC: a character from the "Symbol" category.
        _ => has_symbol(input),
    }
}

/// Reports whether `input` meets at least two of the three extra criteria
/// RFC 8771 lists for the Delightful Confusion Level.
///
/// The Satisfactory requirements are not re-checked here; the caller is
/// expected to have verified them already.
fn is_delightful(input: &str) -> bool {
    // RFC: characters from scripts with different directionalities.
    let mixed_directions = has_different_directionalities(input);
    // RFC: one or more emoji.
    let emoji_found = has_emoji(input);

    match (mixed_directions, emoji_found) {
        // Two criteria already met: no need for the slower check below.
        (true, true) => true,
        // None met: a single remaining criterion cannot reach two.
        (false, false) => false,
        // Exactly one met, so the third criterion decides.
        // RFC: character classified as "Confusables".
        // (This is a slower check, so it is only run when it matters.)
        _ => has_confusables(input),
    }
}

/// Returns `true` if any character of `input` needs more than one octet in
/// UTF-8.
///
/// In UTF-8 exactly the ASCII characters are encoded as a single octet, so
/// this is the same as asking whether the string is not pure ASCII.
fn has_multi_octet_code_point(input: &str) -> bool {
    !input.is_ascii()
}

/// Returns `true` if `data::idna_disallowed` accepts at least one character
/// of `input`; an empty string yields `false`.
fn has_idna2008_disallowed_code_point(input: &str) -> bool {
    for code_point in input.chars() {
        if data::idna_disallowed(code_point) {
            return true;
        }
    }
    false
}

fn has_unprintable_code_point(input: &str) -> bool {
    input.chars().any(unprintable)
}

fn has_multiple_scripts(input: &str) -> bool {
    let mut scr = None;
    for ch in input.chars() {
        let new_scr = script(ch);
        if let Some(scr) = scr {
            if scr != new_scr {
                return true;
            }
        }
        scr = Some(new_scr);
    }
    return false;
}

fn has_symbol(input: &str) -> bool {
    input.chars().any(symbol)
}

fn has_different_directionalities(input: &str) -> bool {
    let mut dir = None;
    for ch in input.chars() {
        let new_dir = directionality(ch);
        if let Some(dir) = dir {
            if dir != new_dir {
                return true;
            }
        }
        dir = Some(new_dir);
    }
    false
}

// Unit tests for the confusion-level classification and its helper
// predicates. Expected results depend on the Unicode data used by the
// helper functions imported at the top of this file.
#[cfg(test)]
mod tests {
    use super::*;
    use unicode_script::Script;

    // The derived ordering follows declaration order of the variants.
    #[test]
    fn confusion_levels_are_ordered() {
        assert!(ConfusionLevel::Satisfactory > ConfusionLevel::Minimum);
        assert!(ConfusionLevel::Delightful > ConfusionLevel::Minimum);
        assert!(ConfusionLevel::Delightful > ConfusionLevel::Satisfactory);
    }

    #[test]
    fn empty_string_does_not_contain_multi_octet_code_points() {
        assert!(!has_multi_octet_code_point(""));
    }

    #[test]
    fn ascii_string_does_not_contain_multi_octet_code_points() {
        assert!(!has_multi_octet_code_point("abc{}-+01"));
    }

    // U+00E9 needs two octets in UTF-8, alone or surrounded by ASCII.
    #[test]
    fn e_acute_does_contain_multi_octet_code_points() {
        assert!(has_multi_octet_code_point("\u{E9}"));
        assert!(has_multi_octet_code_point("abc{}-\u{E9}+01"));
    }

    #[test]
    fn zero_code_point_makes_string_idna_disallowed() {
        assert!(!has_idna2008_disallowed_code_point("foobar"));
        assert!(has_idna2008_disallowed_code_point("foo\u{00}bar"));
    }

    // U+0007 is BEL.
    #[test]
    fn bel_code_point_is_unprintable() {
        assert!(!has_unprintable_code_point("foobar"));
        assert!(has_unprintable_code_point("foo\u{07}bar"));
    }

    // (sic: "inprintable" in the test name is a typo for "unprintable";
    // the name is left unchanged.)
    #[test]
    fn form_feed_is_inprintable() {
        // As stated in the RFC, U+000C is unprintable.
        assert!(!has_unprintable_code_point("012345"));
        assert!(has_unprintable_code_point("012\u{0C}345"));
    }

    #[test]
    fn single_char_has_one_directionality() {
        assert!(!has_different_directionalities("a"));
    }

    // Both an all-Latin string and a string made only of U+07xx characters
    // are expected to have a single directionality each.
    #[test]
    fn many_chars_with_same_directionality_are_not_enough() {
        assert!(!has_different_directionalities("abcdef"));
        assert!(!has_different_directionalities("\u{07d2}\u{07de}\u{07cf}"));
    }

    // Mixing "a" with U+07D2 is expected to give two directionalities.
    #[test]
    fn chars_with_different_directionalities_are_detected() {
        assert!(has_different_directionalities("a\u{07d2}"));
    }

    #[test]
    fn empty_string_is_not_confusing() {
        assert_eq!(confusion_level(""), None);
    }

    #[test]
    fn normal_string_is_not_confusing() {
        assert_eq!(confusion_level("some-allowed-Chars-0"), None);
    }

    // U+037E is multi-octet; the assertion implies it is also treated as
    // IDNA2008-DISALLOWED, and that nothing further is satisfied.
    #[test]
    fn string_with_multibyte_and_disallowed_char_is_minimum() {
        assert_eq!(
            confusion_level("foo\u{037e}bar"),
            Some(ConfusionLevel::Minimum)
        );
    }

    #[test]
    fn string_with_multibyte_and_disallowed_char_is_delightful() {
        // U+E0075 is four utf-8 code units, which is more than one.
        // U+E0075 is DISALLOWED in IDNA2008.
        // U+E0075 is non-printable
        // a is from the Latin script, so we have multiple
        // + is a symbol
        //
        // NOTE(review): the list above only covers the Minimum and
        // Satisfactory criteria; which two Delightful criteria this input
        // meets is not stated here — confirm against the Unicode helpers.

        assert_eq!(
            confusion_level("foo\u{e0075}bar+"),
            Some(ConfusionLevel::Delightful)
        );
    }

    // BEL alone is a single-octet character, so the Minimum level is not
    // reached.
    #[test]
    fn string_with_just_bel_is_not_confusing() {
        assert_eq!(confusion_level("foo\u{07}bar"), None);
    }

    #[test]
    fn string_with_bel_plus_min_reqs_is_delightful() {
        assert_eq!(
            confusion_level("foo\u{07}bar\u{e0075}\u{2208}"),
            Some(ConfusionLevel::Delightful)
        );
    }

    #[test]
    fn string_without_multiple_scripts_is_delightful() {
        // U+1F4A9 is four utf-8 code units, which is more than one.
        // U+0007 BELL is DISALLOWED in IDNA2008.
        // U+0000 NULL is non-printable
        // U+2208 is a symbol
        // All 4 are in the Common script

        // So 2/3 = Satisfactory
        //
        // NOTE(review): the assertion below expects Delightful, so at least
        // two Delightful criteria must also hold for this input (presumably
        // U+1F4A9 counts as an emoji) — confirm which ones.
        assert_eq!(
            confusion_level("\u{0007}\u{0000}\u{1F4A9}\u{2208}"),
            Some(ConfusionLevel::Delightful)
        );
    }

    #[test]
    fn string_without_symbols_is_delightful() {
        // U+00E9 is two utf-8 code units, which is more than one.
        // U+0007 BELL is DISALLOWED in IDNA2008.
        // U+0000 NULL is non-printable
        // a is from the Latin script, so we have multiple
        // But, we have no symbols

        // 2/3 = Satisfactory

        // They also have different directionalities
        // And some are confusable
        // But, there are no emoji

        // 2/3 = Delightful

        assert_eq!(
            confusion_level("\u{0007}\u{0000}\u{00E9}a"),
            Some(ConfusionLevel::Delightful)
        );
    }

    #[test]
    fn string_without_symbols_or_multiple_scripts_is_not_satisfactory() {
        // U+0084 is two utf-8 code units, which is more than one.
        // U+0007 BELL is DISALLOWED in IDNA2008.
        // U+0000 NULL is non-printable
        // All 3 are in the Common script
        // We have no symbols

        // 1/3 = Minimum

        let s = "\u{0007}\u{0000}\u{0084}";

        // Guard the premise of the test: every character is Script::Common.
        assert_eq!(
            scripts(s),
            vec![Script::Common, Script::Common, Script::Common]
        );
        assert_eq!(confusion_level(s), Some(ConfusionLevel::Minimum));
    }

    // Expected to reach Satisfactory but not Delightful.
    #[test]
    fn string_with_symbols_etc_is_satisfactory() {
        assert_eq!(
            confusion_level("\u{000B}\u{06ab}\u{0004}\u{0024}"),
            Some(ConfusionLevel::Satisfactory)
        );
    }

    #[test]
    fn string_with_symbols_etc_is_delightful() {
        // U+1F4A9 is four utf-8 code units, which is more than one.
        // U+0007 BELL is DISALLOWED in IDNA2008.
        // U+0000 NULL is non-printable
        // a is from the Latin script, so we have multiple
        // U+2208 ELEMENT OF is a Symbol
        assert_eq!(
            confusion_level("\u{0007}\u{0000}\u{1F4A9}\u{2208}a"),
            Some(ConfusionLevel::Delightful)
        );
    }

    // Test helper: the script of each character of `input`, in order.
    fn scripts(input: &str) -> Vec<Script> {
        input.chars().map(|c| script(c)).collect()
    }
}