boreal_parser/
string.rs

1//! Parsing related to strings and identifiers.
2
3use nom::bytes::complete::take_while;
4use nom::character::complete::char;
5use nom::combinator::{cut, map, opt, recognize};
6use nom::error::{ErrorKind as NomErrorKind, ParseError};
7use nom::sequence::{pair, preceded};
8use nom::Parser;
9
10use super::error::Error;
11use super::nom_recipes::{rtrim, take_one};
12use super::types::{Input, ParseResult};
13
/// Tells whether `c` may appear inside an identifier.
///
/// Accepted characters are the ASCII letters `a-z` and `A-Z`, the ASCII
/// digits `0-9`, and the underscore.
fn is_identifier_digit(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}
18
19/// Parse the contents of an identifier string.
20///
21/// This is always the rest of an identifier type, where the first
22/// character determines which type of identifier is being parsed.
23///
24/// This function *does not* right-trim, as it can be followed
25/// by a '*' character that is meaningful in some contexts.
26fn identifier_contents(input: Input) -> ParseResult<String> {
27    map(take_while(is_identifier_digit), |input: Input| {
28        input.cursor().to_owned()
29    })
30    .parse(input)
31}
32
33/// Helper for [`string_identifier`] and [`string_identifier_with_wildcard`].
34fn string_identifier_no_rtrim(input: Input) -> ParseResult<String> {
35    preceded(char('$'), cut(identifier_contents)).parse(input)
36}
37
38/// Parse a string identifier.
39///
40/// This is equivalent to the `_STRING_IDENTIFIER_` lexical patterns in
41/// libyara.
42/// Roughly equivalent to `$[a-ZA-Z0-9_]*`.
43pub(crate) fn string_identifier(input: Input) -> ParseResult<String> {
44    rtrim(string_identifier_no_rtrim).parse(input)
45}
46
47/// Parse a string identifier with an optional trailing wildcard.
48///
49/// This is equivalent to
50/// `_STRING_IDENTIFIER_ | _STRING_IDENTIFIER_WITH_WILDCARD_` in libyara.
51pub(crate) fn string_identifier_with_wildcard(input: Input) -> ParseResult<(String, bool)> {
52    rtrim(pair(
53        string_identifier_no_rtrim,
54        map(opt(char('*')), |v| v.is_some()),
55    ))
56    .parse(input)
57}
58
59/// Parse a string count, roughly equivalent to `#[a-zA-Z0-9_]*`.
60pub(crate) fn count(input: Input) -> ParseResult<String> {
61    rtrim(preceded(char('#'), cut(identifier_contents))).parse(input)
62}
63
64/// Parse a string offset, roughly equivalent to `@[a-zA-Z0-9_]*`.
65pub(crate) fn offset(input: Input) -> ParseResult<String> {
66    rtrim(preceded(char('@'), cut(identifier_contents))).parse(input)
67}
68
69/// Parse a string length, roughly equivalent to `![a-zA-Z0-9_]*`.
70pub(crate) fn length(input: Input) -> ParseResult<String> {
71    rtrim(preceded(char('!'), cut(identifier_contents))).parse(input)
72}
73
74/// Parse an identifier.
75///
76/// This is roughly equivalent to `[a-ZA-Z_][a-zA-Z0-9_]*`.
77pub(crate) fn identifier(input: Input) -> ParseResult<String> {
78    rtrim(map(
79        recognize((
80            take_one(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '_')),
81            cut(take_while(is_identifier_digit)),
82        )),
83        |input| input.cursor().to_owned(),
84    ))
85    .parse(input)
86}
87
88/// Parse a quoted string with escapable characters.
89///
90/// Equivalent to the `_TEXT_STRING_` lexical pattern in libyara.
91/// This is roughly equivalent to the pattern `/"[^\n\"]*"/`, with control
92/// patterns `\t`, `\r`, `\n`, `\"`, `\\`, and `\x[0-9a-fA-F]{2}`.
93///
94/// This parser allows non ascii bytes, hence returning a byte string.
95pub(crate) fn quoted(input: Input) -> ParseResult<Vec<u8>> {
96    rtrim(quoted_no_rtrim).parse(input)
97}
98
99fn quoted_no_rtrim(input: Input) -> ParseResult<Vec<u8>> {
100    let (mut input, _) = char('"').parse(input)?;
101
102    let mut index = 0;
103    let mut res = Vec::new();
104
105    let mut chars = input.cursor().char_indices();
106
107    while let Some((i, c)) = chars.next() {
108        index = i;
109        match c {
110            '\\' => match chars.next() {
111                Some((_, 't')) => res.push(b'\t'),
112                Some((_, 'r')) => res.push(b'\r'),
113                Some((_, 'n')) => res.push(b'\n'),
114                Some((_, '"')) => res.push(b'"'),
115                Some((_, '\\')) => res.push(b'\\'),
116                Some((_, 'x')) => match (chars.next(), chars.next()) {
117                    (Some((i1, a)), Some((i2, b))) => {
118                        let Some(a) = a.to_digit(16) else {
119                            index = i1;
120                            break;
121                        };
122                        let Some(b) = b.to_digit(16) else {
123                            index = i2;
124                            break;
125                        };
126                        #[allow(clippy::cast_possible_truncation)]
127                        res.push(((a as u8) << 4) + (b as u8));
128                    }
129                    _ => break,
130                },
131                Some((j, _)) => {
132                    index = j;
133                    break;
134                }
135                None => break,
136            },
137            '"' => {
138                input.advance(i + 1);
139                return Ok((input, res));
140            }
141            c => {
142                let mut buf = [0; 4];
143                let _r = c.encode_utf8(&mut buf);
144                res.extend(&buf[..c.len_utf8()]);
145            }
146        }
147    }
148
149    input.advance(index);
150    Err(nom::Err::Error(Error::from_error_kind(
151        input,
152        NomErrorKind::EscapedTransform,
153    )))
154}
155
#[cfg(test)]
mod tests {
    use super::super::test_helpers::{parse, parse_err};
    use super::{
        count, identifier, length, offset, quoted, string_identifier,
        string_identifier_with_wildcard,
    };

    /// Quoted strings: plain contents, escapes, raw bytes, and rejected inputs.
    #[test]
    fn test_parse_quoted() {
        // Unescaped contents.
        parse(quoted, "\"\" b", "b", "");
        parse(quoted, "\"1\"b", "b", "1");
        parse(quoted, "\"abc +$\" b", "b", "abc +$");

        // Single-character escapes.
        parse(
            quoted,
            r#"" \r \n \t \"\\a \\r""#,
            "",
            " \r \n \t \"\\a \\r",
        );

        // Hexadecimal escapes, including bytes that are not valid UTF-8.
        parse(quoted, r#""\x10 \x32""#, "", "\u{10} 2");
        parse(quoted, r#""\x00 \xFF""#, "", [0, b' ', 255]);
        parse(quoted, r#""\xc3\x0f]\x00""#, "", [0xc3, 0x0f, b']', 0x00]);

        // Non-ASCII characters are kept as their UTF-8 encoding.
        parse(quoted, r#""é"a"#, "a", [0xc3, 0xa9]);

        // Missing quotes, unknown escapes and malformed `\x` escapes.
        let rejected = [
            "a",
            r#"""#,
            r#""ab"#,
            r#""a\"#,
            r#""a\xAG""#,
            r#""a\xGA""#,
            r#""\a""#,
            r#""\x"#,
            r#""\x1"#,
            r#""\x1""#,
        ];
        for source in rejected {
            parse_err(quoted, source);
        }
    }

    /// `$name` identifiers: a trailing `*` is never consumed.
    #[test]
    fn test_string_identifier() {
        let accepted = [
            ("$-", "-", ""),
            ("$*", "*", ""),
            ("$a c", "c", "a"),
            ("$9b*c", "*c", "9b"),
            ("$_1Bd_F+", "+", "_1Bd_F"),
        ];
        for (source, rest, name) in accepted {
            parse(string_identifier, source, rest, name);
        }

        for source in ["", "*"] {
            parse_err(string_identifier, source);
        }
    }

    /// `$name` identifiers with an optional trailing `*`.
    #[test]
    fn test_string_identifier_with_wildcard() {
        let accepted = [
            ("$_*", "", "_", true),
            ("$", "", "", false),
            ("$a* c", "c", "a", true),
            ("$9b*c", "c", "9b", true),
            ("$_1Bd_F+", "+", "_1Bd_F", false),
        ];
        for (source, rest, name, wildcard) in accepted {
            parse(
                string_identifier_with_wildcard,
                source,
                rest,
                (name.to_owned(), wildcard),
            );
        }

        for source in ["", "*"] {
            parse_err(string_identifier_with_wildcard, source);
        }
    }

    /// `#name` string counts.
    #[test]
    fn test_count() {
        let accepted = [
            ("#-", "-", ""),
            ("#*", "*", ""),
            ("#a c", "c", "a"),
            ("#9b*c", "*c", "9b"),
            ("#_1Bd_F+", "+", "_1Bd_F"),
        ];
        for (source, rest, name) in accepted {
            parse(count, source, rest, name);
        }

        for source in ["", "$", "@", "!", "*"] {
            parse_err(count, source);
        }
    }

    /// `@name` string offsets.
    #[test]
    fn test_offset() {
        let accepted = [
            ("@-", "-", ""),
            ("@*", "*", ""),
            ("@a c", "c", "a"),
            ("@9b*c", "*c", "9b"),
            ("@_1Bd_F+", "+", "_1Bd_F"),
        ];
        for (source, rest, name) in accepted {
            parse(offset, source, rest, name);
        }

        for source in ["", "$", "#", "!", "*"] {
            parse_err(offset, source);
        }
    }

    /// `!name` string lengths.
    #[test]
    fn test_length() {
        let accepted = [
            ("!-", "-", ""),
            ("!*", "*", ""),
            ("!a c", "c", "a"),
            ("!9b*c", "*c", "9b"),
            ("!_1Bd_F+", "+", "_1Bd_F"),
        ];
        for (source, rest, name) in accepted {
            parse(length, source, rest, name);
        }

        for source in ["", "$", "#", "@", "*"] {
            parse_err(length, source);
        }
    }

    /// Plain identifiers: must start with a letter or an underscore.
    #[test]
    fn test_identifier() {
        let accepted = [
            ("a+", "+", "a"),
            ("_*", "*", "_"),
            ("A5 c", "c", "A5"),
            ("g9b*c", "*c", "g9b"),
            ("__1Bd_F+", "+", "__1Bd_F"),
        ];
        for (source, rest, name) in accepted {
            parse(identifier, source, rest, name);
        }

        for source in ["", "*", "$", "9", "9b"] {
            parse_err(identifier, source);
        }
    }
}