nom_lua/
string.rs

1// Copyright 2017 The nom-lua project developers
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use ast::ASTNode;
10use nom::{hex_digit, digit};
11use std::{str, char};
12
13named!(pub parse_string<ASTNode>,
14       map!(alt!(/*parse_string_literal |*/ parse_string_short_literal), |s| ASTNode::String(s)));
15
16
17//named!(parse_string_literal<String>, map_res!(raw_string, |s, _| s));
18
19//pub fn raw_string(input: &str) -> IResult<&str, (String, usize)> {
20//    let mut chars = input.char_indices();
21//    let mut n = 0;
22//    while let Some((byte_offset, ch)) = chars.next() {
23//        match ch {
24//            '"' => {
25//                n = byte_offset;
26//                break;
27//            }
28//            '#' => {}
29//            _ => return IResult::Error,
30//        }
31//    }
32//    let mut s = String::new();
33//    for (byte_offset, ch) in chars {
34//        match ch {
35//            '"' if input[byte_offset + 1..].starts_with(&input[..n]) => {
36//                let rest = &input[byte_offset + 1 + n..];
37//                return IResult::Done(rest, (s, n));
38//            }
39//            '\r' => {}
40//            _ => s.push(ch),
41//        }
42//    }
43//    IResult::Error
44//}
45
46// TODO: A short literal string cannot contain unescaped line breaks nor escapes not forming a valid escape sequence.
47// TODO: " ' inside strings are valid
48named!(parse_string_short_literal<String>,
49       delimited!(
50        alt!(tag!("\"") | tag!("'")),
51        fold_many0!(alt!(
52            map!(linebreak, |_| '\n') |
53            parse_byte |
54            parse_unicode |
55            one_of!("\x07\x08\x09\x0A\x0B\x0C\x0D")
56            // Find a way to discard the output from this: preceded!(tag!(r#"\z"#), alt!(sp |
57            // linebreak))
58        ), String::new(), |mut acc: String, item| {
59            acc.push(item);
60            acc
61        }),
62        alt!(tag!("\"") | tag!("'"))));
63
64named!(parse_byte<char>, alt!(parse_byte_x | parse_byte_d));
65
66named!(parse_byte_x<char>, map!(map_res!(map_res!(
67                preceded!(tag!("\\x"), hex_digit),
68                str::from_utf8),
69            |s| u8::from_str_radix(s, 16)), |i: u8| i as char));
70
71named!(linebreak, alt!(tag!("\\\r\n") | tag!("\\\n\r") | tag!("\\\n")));
72
73
74// TODO: if a decimal escape sequence is to be followed by a digit, it must be expressed using exactly three digits
75// Notice, the fold_many_m_n is not actually enforcing bounds here, because digit recognizes
76// more than one character, but I think this implementation might be usefull for the future
77named!(parse_byte_d<char>, map!(map_res!(
78            preceded!(tag!("\\"), fold_many_m_n!(1, 3, digit, String::new(), |mut acc: String, item: &[u8]| {
79                for c in item {
80                    acc.push(*c as char);
81                }
82                acc
83            })),
84            |s: String| s.parse::<u8>()), |i: u8| i as char));
85
86named!(parse_unicode<char>,
87       map_opt!(
88           map_res!(
89               map_res!(
90                   delimited!(tag!("\\u{"), recognize!(hex_digit), tag!("}")),
91                   str::from_utf8),
92                   |h| u32::from_str_radix(h, 16)),
93                   char::from_u32));
94
#[cfg(test)]
mod tests {
    // ---- `\u{...}` escapes --------------------------------------------------
    ast_panic_test!(parse_unicode_1, parse_unicode, r#"\u{}"#);
    ast_test!(parse_unicode_2, parse_unicode, r#"\u{A}"#, char::from_u32(0xA).unwrap());
    ast_test!(parse_unicode_3, parse_unicode, r#"\u{a2}"#, char::from_u32(0xa2).unwrap());
    ast_test!(parse_unicode_4, parse_unicode, r#"\u{AFf9}"#, char::from_u32(0xAFf9).unwrap());
    ast_test!(
        parse_unicode_5,
        parse_unicode,
        r#"\u{0000000000000FFFF}"#,
        char::from_u32(0xFFFF).unwrap()
    );
    ast_test!(parse_unicode_6, parse_unicode, r#"\u{10FFFF}"#, char::from_u32(0x10FFFF).unwrap());
    ast_panic_test!(parse_unicode_7, parse_unicode, r#"\u{110000}"#);

    // ---- decimal byte escapes (`\ddd`) --------------------------------------
    ast_test!(parse_byte_d_1, parse_byte_d, r#"\0"#, '\0');
    ast_test!(parse_byte_d_2, parse_byte_d, r#"\00"#, '\0');
    ast_test!(parse_byte_d_3, parse_byte_d, r#"\000"#, '\0');
    // TODO: This should parse to the rust string r#"\u{0}0"# make this test reflect that
    ast_test!(parse_byte_d_4, parse_byte_d, r#"\0000"#, '\0');
    ast_test!(parse_byte_d_5, parse_byte_d, r#"\230"#, '\u{E6}');
    ast_panic_test!(parse_byte_d_6, parse_byte_d, r#"\256"#);

    // ---- hexadecimal byte escapes (`\x..`) ----------------------------------
    ast_test!(parse_byte_x_1, parse_byte_x, r#"\x00"#, '\0');
    // TODO: This should parse to the rust string "\u{0a}0" make this test reflect that
    //ast_test!(parse_byte_x_2, parse_byte_x, r#"\x0a0"#, '\x0a');
    ast_test!(parse_byte_x_3, parse_byte_x, r#"\x23"#, '\u{23}');
    ast_test!(parse_byte_x_4, parse_byte_x, r#"\x000023"#, '\u{23}');
    ast_test!(parse_byte_x_5, parse_byte_x, r#"\xFf"#, '\u{FF}');

    // ---- short (quoted) string literals -------------------------------------
    ast_test!(parse_string_short_literal_1, parse_string_short_literal, r#""""#, "");
    ast_test!(parse_string_short_literal_2, parse_string_short_literal, r#"''"#, "");
    ast_test!(parse_string_short_literal_3, parse_string_short_literal, r#"'\u{1F62A}'"#, "😪");
    ast_test!(parse_string_short_literal_4, parse_string_short_literal, r#"'\097'"#, "a");
    // Raw control bytes 0x07..=0x0D placed directly between the quotes.
    ast_test!(
        parse_string_short_literal_5,
        parse_string_short_literal,
        format!("'{}'", "\x07\x08\x09\x0A\x0B\x0C\x0D"),
        "\x07\x08\x09\x0A\x0B\x0C\x0D"
    );
    // Backslash followed by a line break sequence yields a single '\n'.
    ast_test!(parse_string_short_literal_6, parse_string_short_literal, "'\\\n\r'", "\n");
    ast_test!(parse_string_short_literal_7, parse_string_short_literal, "'\\\r\n'", "\n");
    ast_test!(parse_string_short_literal_8, parse_string_short_literal, "'\\\n'", "\n");
}