//! dotproperties 0.1.0
//!
//! Parser for the Java .properties file format.
// Apparently, referring to a function from within a macro is not enough for
// it to be considered used.
#![allow(dead_code)]

use nom::hex_u32;

// Some Unicode terminology:
// * Unicode: The standard that assigns a numerical value to every character
// * UTF-16: A way to encode Unicode text. Each Unicode code point can be
//   represented by either one or two u16 code units.
// * Code point: One of the values defined by Unicode. Fits in a u32.
// * Code unit: A value used by a Unicode encoding. For UTF-16, a code unit fits
//   in a u16.
// * Basic Multilingual Plane (BMP): The set of Unicode code points that can be
//   represented in UTF-16 using a single u16 code unit.
// * Supplementary Plane: The set of Unicode code points that are not part of
//   the BMP (each one needs two u16 code units in UTF-16).
// * Surrogate pair: A pair of UTF-16 code units that form a code point (so,
//   the code point is part of a Supplementary Plane)
// * High/Low Surrogate: High is the first code unit of a Surrogate pair. Low
//   is the second one.

/// A parser for one escaped UTF-16 code unit: the two bytes `\u` followed by
/// exactly four hexadecimal digits (upper or lower case).
/// example: \u1a5c
///
/// The value of the four digits is returned as a `u16`. This parser does not
/// look at which UTF-16 category the code unit belongs to.
named!(code_unit<u16>,
    do_parse!(
        tag!(r"\u") >>
        // Isolate exactly four bytes, then require `hex_u32` to consume all of
        // them (`eof!`), so anything other than four hex digits is rejected.
        hex: flat_map!(take!(4), terminated!(hex_u32, eof!())) >>
        // Four hex digits never exceed 0xFFFF, so this cast cannot truncate.
        (hex as u16)
    )
);

/// The three disjoint groups a UTF-16 code unit can fall into.
///
/// `Debug`, `Clone` and `Copy` are derived (in addition to the equality
/// traits) so that a category can be printed in diagnostics and handed around
/// by value without being moved.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Category {
    /// A code unit that encodes a code point on its own, i.e. any value
    /// outside of 0xD800-0xDFFF.
    Bmp,
    /// The second code unit of a surrogate pair (0xDC00-0xDFFF).
    LowSurrogate,
    /// The first code unit of a surrogate pair (0xD800-0xDBFF).
    HighSurrogate,
}
use self::Category::*;

/// Classify a UTF-16 code unit into one of the 3 non-overlapping categories.
///
/// * 0xD800-0xDBFF: `HighSurrogate`
/// * 0xDC00-0xDFFF: `LowSurrogate`
/// * anything else: `Bmp`
fn category(v: u16) -> Category {
    // Both surrogate ranges are 1024 values wide and aligned on a 1024
    // boundary, so the six most significant bits alone identify them.
    match v & 0xFC00 {
        0xD800 => HighSurrogate,
        0xDC00 => LowSurrogate,
        _ => Bmp,
    }
}

/// A parser that reads one escaped code unit (see `code_unit`) and succeeds
/// only if that code unit belongs to the UTF-16 category `cat`.
///
/// On success the raw `u16` code unit is returned unchanged.
named_args!(code_unit_cat(cat: Category)<u16>,
    // `verify!` turns a successful parse into a failure when the predicate is false.
    verify!(code_unit, |v| category(v) == cat)
);

/// Convert a surrogate pair to the Unicode character it encodes.
///
/// `high` must be a high surrogate and `low` a low surrogate. This is checked
/// with `debug_assert!` only, so in release builds a violated precondition is
/// not guaranteed to be detected.
///
/// The pair is decoded directly from the two code units, without building an
/// intermediate `String`.
///
/// # Panics
///
/// Panics if the decoder reports the code units as an unpaired surrogate,
/// which cannot happen when the precondition above holds.
fn surrogate_pair_to_char(high: u16, low: u16) -> char {
    debug_assert!(category(high) == HighSurrogate);
    debug_assert!(category(low) == LowSurrogate);
    ::std::char::decode_utf16([high, low].iter().cloned())
        .next()
        .expect("The string contains one character")
        .expect("All combinations of high+ low surrogate form a valid character")
}

/// A parser that reads two consecutive escaped code units forming a surrogate
/// pair, and returns the character the pair encodes.
///
/// The first escape must be a high surrogate and the second a low surrogate;
/// the two are then combined by `surrogate_pair_to_char`.
named!(code_unit_supplementary<char>,
    do_parse!(
        high: call!(code_unit_cat, HighSurrogate) >>
        low: call!(code_unit_cat, LowSurrogate) >>
        // Both categories were verified above, as the conversion requires.
        (surrogate_pair_to_char(high, low))
    )
);

/// Turn a single UTF-16 code unit into the character it stands for.
///
/// The caller must pass a code unit of the `Bmp` category (i.e. not a
/// surrogate); this is checked in debug builds. For such a code unit the
/// numerical value is the Unicode code point itself.
fn basic_to_char(v: u16) -> char {
    debug_assert!(category(v) == Bmp);
    // Widening u16 -> u32 is lossless.
    let scalar = u32::from(v);
    ::std::char::from_u32(scalar).expect("All BMP code units are valid characters")
}

/// A parser that reads one escaped code unit belonging to the BMP category
/// and returns it as a `char`.
///
/// Escapes whose code unit is a (high or low) surrogate are rejected.
named!(code_unit_bmp<char>,
    map!(
        // Only accept a code unit that stands for a character on its own...
        call!(code_unit_cat, Bmp),
        // ...so that the conversion's precondition is always met.
        basic_to_char
    )
);

/// A parser that reads one escaped character: either a single escaped code
/// unit from the BMP, or two escaped code units forming a surrogate pair.
///
/// The alternatives are listed BMP first, surrogate pair second. The tests in
/// this file show that a high surrogate with nothing after it is reported as
/// incomplete, while a lone low surrogate or two high surrogates in a row are
/// errors.
named!(pub escape_sequence<char>,
    alt!(code_unit_bmp | code_unit_supplementary)
);

#[cfg(test)]
mod tests {
    use super::*;

    // `code_unit` on its own: one `\uXXXX` escape, whatever its category.
    #[test]
    fn test_code_unit() {
        // A valid escape with decimal digits
        assert_done!(code_unit(br"\u1234"), 0x1234);

        // A valid escape with lowercase and uppercase hex digits
        assert_done!(code_unit(br"\uABcd"), 0xabcd);

        // A valid escape followed by non-matched bytes
        assert_done_partial!(code_unit(br"\u123456789"), 0x1234, b"56789");

        // Escape with invalid characters
        assert_error!(code_unit(br"\u54xt"));

        // Escape with not enough digits
        assert_incomplete!(code_unit(br"\u15"));
    }

    // `escape_sequence`: one or two escapes decoded to a single character.
    #[test]
    fn test_escape_sequence() {
        // https://en.wikipedia.org/wiki/UTF-16#Examples

        // A codepoint from the Basic Multilingual Plane is self-sufficient
        assert_done!(escape_sequence(br"\u0024"), '\u{0024}');
        assert_done!(escape_sequence(br"\uFA15"), '\u{FA15}');

        // The lowest and highest BMP code units, and the two code units that
        // sit right next to the surrogate range
        assert_done!(escape_sequence(br"\u0000"), '\u{0000}');
        assert_done!(escape_sequence(br"\uD7FF"), '\u{D7FF}');
        assert_done!(escape_sequence(br"\uE000"), '\u{E000}');
        assert_done!(escape_sequence(br"\uFFFF"), '\u{FFFF}');

        // Only one character is read; what follows is left untouched
        assert_done_partial!(escape_sequence(br"\u0024\u0025"), '\u{0024}', br"\u0025");

        // A surrogate pair, that forms a code point from a Supplementary Plane
        assert_done!(escape_sequence(br"\uD801\uDC37"), '\u{10437}');
        assert_done!(escape_sequence(br"\uD852\uDF62"), '\u{24B62}');

        // The first and the last code points reachable with a surrogate pair
        assert_done!(escape_sequence(br"\uD800\uDC00"), '\u{10000}');
        assert_done!(escape_sequence(br"\uDBFF\uDFFF"), '\u{10FFFF}');

        // A high surrogate alone is incomplete
        assert_incomplete!(escape_sequence(br"\uD801"));
        assert_incomplete!(escape_sequence(br"\uD852"));

        // A low surrogate alone is not valid
        assert_error!(escape_sequence(br"\uDC37"));
        assert_error!(escape_sequence(br"\uDF62"));

        // Two high surrogates are not valid
        assert_error!(escape_sequence(br"\uD801\uD852"));

        // A high surrogate followed by a BMP code unit is not valid
        assert_error!(escape_sequence(br"\uD801\u0024"));
    }
}